pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call. */
  70     size_t token_pos;           /* Offset into src->buffer of token start. */
  71     size_t token_len;           /* Length of source for token in bytes. */
  72
  73     /* For a token obtained through macro expansion, this is just this token.
  74
  75        For a token obtained through the lexer in an ordinary way, these are
  76        nulls and zeros. */
  77     char *macro_rep;        /* The whole macro expansion. */
  78     size_t ofs;             /* Offset of this token in macro_rep. */
  79     size_t len;             /* Length of this token in macro_rep. */
  80     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  81   };
  82
  83 static struct msg_point lex_token_start_point (const struct lex_source *,
  84                                                const struct lex_token *);
  85 static struct msg_point lex_token_end_point (const struct lex_source *,
  86                                              const struct lex_token *);
  87
  88 static bool lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s,
  89                                  size_t *n_matchedp);
  90
  91 /* Source offset of the last byte in TOKEN. */
  92 static size_t
  93 lex_token_end (const struct lex_token *token)
  94 {
  95   return token->token_pos + MAX (token->token_len, 1) - 1;
  96 }
  97
  98 static void
  99 lex_token_destroy (struct lex_token *t)
 100 {
 101   token_uninit (&t->token);
 102   if (t->ref_cnt)
 103     {
 104       assert (*t->ref_cnt > 0);
 105       if (!--*t->ref_cnt)
 106         {
 107           free (t->macro_rep);
 108           free (t->ref_cnt);
 109         }
 110     }
 111   free (t);
 112 }
 113 \f
 114 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 115    lex_source. */
 116 struct lex_stage
 117   {
 118     struct deque deque;
 119     struct lex_token **tokens;
 120   };
 121
 122 static void lex_stage_clear (struct lex_stage *);
 123 static void lex_stage_uninit (struct lex_stage *);
 124
 125 static size_t lex_stage_count (const struct lex_stage *);
 126 static bool lex_stage_is_empty (const struct lex_stage *);
 127
 128 static struct lex_token *lex_stage_first (struct lex_stage *);
 129 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 130
 131 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 132 static void lex_stage_pop_first (struct lex_stage *);
 133
 134 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 135                              size_t n);
 136
 137 /* Deletes all the tokens from STAGE. */
 138 static void
 139 lex_stage_clear (struct lex_stage *stage)
 140 {
 141   while (!deque_is_empty (&stage->deque))
 142     lex_stage_pop_first (stage);
 143 }
 144
 145 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 146 static void
 147 lex_stage_uninit (struct lex_stage *stage)
 148 {
 149   lex_stage_clear (stage);
 150   free (stage->tokens);
 151 }
 152
 153 /* Returns true if STAGE contains no tokens, otherwise false. */
 154 static bool
 155 lex_stage_is_empty (const struct lex_stage *stage)
 156 {
 157   return deque_is_empty (&stage->deque);
 158 }
 159
 160 /* Returns the number of tokens in STAGE. */
 161 static size_t
 162 lex_stage_count (const struct lex_stage *stage)
 163 {
 164   return deque_count (&stage->deque);
 165 }
 166
 167 /* Returns the first token in STAGE, which must be nonempty.
 168    The first token is the one accessed with the least lookahead. */
 169 static struct lex_token *
 170 lex_stage_first (struct lex_stage *stage)
 171 {
 172   return lex_stage_nth (stage, 0);
 173 }
 174
 175 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 176    lookahead) is 0, the second token is 1, and so on.  There must be at least
 177    INDEX + 1 tokens in STAGE. */
 178 static struct lex_token *
 179 lex_stage_nth (struct lex_stage *stage, size_t index)
 180 {
 181   return stage->tokens[deque_back (&stage->deque, index)];
 182 }
 183
 184 /* Adds TOKEN so that it becomes the last token in STAGE. */
 185 static void
 186 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 187 {
 188   if (deque_is_full (&stage->deque))
 189     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 190                                   sizeof *stage->tokens);
 191   stage->tokens[deque_push_front (&stage->deque)] = token;
 192 }
 193
 194 /* Removes and returns the first token from STAGE. */
 195 static struct lex_token *
 196 lex_stage_take_first (struct lex_stage *stage)
 197 {
 198   return stage->tokens[deque_pop_back (&stage->deque)];
 199 }
 200
 201 /* Removes the first token from STAGE and uninitializes it. */
 202 static void
 203 lex_stage_pop_first (struct lex_stage *stage)
 204 {
 205   lex_token_destroy (lex_stage_take_first (stage));
 206 }
 207
 208 /* Removes the first N tokens from SRC, appending them to DST as the last
 209    tokens. */
 210 static void
 211 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 212 {
 213   for (size_t i = 0; i < n; i++)
 214     lex_stage_push_last (dst, lex_stage_take_first (src));
 215 }
 216
 217 /* A source of tokens, corresponding to a syntax file.
 218
 219    This is conceptually a lex_reader wrapped with everything needed to convert
 220    its UTF-8 bytes into tokens. */
 221 struct lex_source
 222   {
 223     struct ll ll;               /* In lexer's list of sources. */
 224
 225     /* Reference count:
 226
 227        - One for struct lexer.
 228
 229        - One for each struct msg_location that references this source. */
 230     size_t n_refs;
 231
 232     struct lex_reader *reader;
 233     struct lexer *lexer;
 234     struct segmenter segmenter;
 235     bool eof;                   /* True if T_STOP was read from 'reader'. */
 236
 237     /* Buffer of UTF-8 bytes. */
 238     char *buffer;               /* Source file contents. */
 239     size_t length;              /* Number of bytes filled. */
 240     size_t allocated;           /* Number of bytes allocated. */
 241
 242     /* Offsets into 'buffer'. */
 243     size_t journal_pos;         /* First byte not yet output to journal. */
 244     size_t seg_pos;             /* First byte not yet scanned as token. */
 245
 246     /* Offset into 'buffer' of starts of lines. */
 247     size_t *lines;
 248     size_t n_lines, allocated_lines;
 249
 250     bool suppress_next_newline;
 251
 252     /* Tokens.
 253
 254        This is a pipeline with the following stages.  Each token eventually
 255        made available to the parser passes through of these stages.  The stages
 256        are named after the processing that happens in each one.
 257
 258        Initially, tokens come from the segmenter and scanner to 'pp':
 259
 260        - pp: Tokens that need to pass through the macro preprocessor to end up
 261          in 'merge'.
 262
 263        - merge: Tokens that need to pass through scan_merge() to end up in
 264          'parse'.
 265
 266        - parse: Tokens available to the client for parsing.
 267
 268       'pp' and 'merge' store tokens only temporarily until they pass into
 269       'parse'.  Tokens then live in 'parse' until the command is fully
 270       consumed, at which time they are freed together. */
 271     struct lex_stage pp;
 272     struct lex_stage merge;
 273     struct lex_token **parse;
 274     size_t n_parse, allocated_parse, parse_ofs;
 275   };
 276
 277 static struct lex_source *lex_source_create (struct lexer *,
 278                                              struct lex_reader *);
 279
 280 /* Lexer. */
 281 struct lexer
 282   {
 283     struct ll_list sources;     /* Contains "struct lex_source"s. */
 284     struct macro_set *macros;
 285
 286     /* Temporarily stores errors and warnings to be emitted by the lexer while
 287        lexing is going on, to avoid reentrancy. */
 288     struct msg **messages;
 289     size_t n_messages, allocated_messages;
 290   };
 291
 292 static struct lex_source *lex_source__ (const struct lexer *);
 293 static char *lex_source_syntax__ (const struct lex_source *,
 294                                   int ofs0, int ofs1);
 295 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 296 static void lex_source_push_endcmd__ (struct lex_source *);
 297 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 298 static void lex_source_clear_parse (struct lex_source *);
 299
 300 static bool lex_source_get_parse (struct lex_source *);
 301 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
 302                                    int ofs0, int ofs1,
 303                                    const char *format, va_list)
 304    PRINTF_FORMAT (5, 0);
 305 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 306                                                   int n);
 307 \f
 308 /* Initializes READER with the specified CLASS and otherwise some reasonable
 309    defaults.  The caller should fill in the others members as desired. */
 310 void
 311 lex_reader_init (struct lex_reader *reader,
 312                  const struct lex_reader_class *class)
 313 {
 314   reader->class = class;
 315   reader->syntax = SEG_MODE_AUTO;
 316   reader->error = LEX_ERROR_CONTINUE;
 317   reader->file_name = NULL;
 318   reader->encoding = NULL;
 319   reader->line_number = 0;
 320   reader->eof = false;
 321 }
 322
 323 /* Frees any file name already in READER and replaces it by a copy of
 324    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 325 void
 326 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 327 {
 328   free (reader->file_name);
 329   reader->file_name = xstrdup_if_nonnull (file_name);
 330 }
 331 \f
 332 /* Creates and returns a new lexer. */
 333 struct lexer *
 334 lex_create (void)
 335 {
 336   struct lexer *lexer = xmalloc (sizeof *lexer);
 337   *lexer = (struct lexer) {
 338     .sources = LL_INITIALIZER (lexer->sources),
 339     .macros = macro_set_create (),
 340   };
 341   return lexer;
 342 }
 343
 344 /* Destroys LEXER. */
 345 void
 346 lex_destroy (struct lexer *lexer)
 347 {
 348   if (lexer != NULL)
 349     {
 350       struct lex_source *source, *next;
 351
 352       assert (!lexer->messages);
 353
 354       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 355         {
 356           ll_remove (&source->ll);
 357           lex_source_unref (source);
 358         }
 359       macro_set_destroy (lexer->macros);
 360       free (lexer);
 361     }
 362 }
 363
 364 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 365    same name.  Takes ownership of M. */
 366 void
 367 lex_define_macro (struct lexer *lexer, struct macro *m)
 368 {
 369   macro_set_add (lexer->macros, m);
 370 }
 371
 372 /* Returns LEXER's macro set.  The caller should not modify it. */
 373 const struct macro_set *
 374 lex_get_macros (const struct lexer *lexer)
 375 {
 376   return lexer->macros;
 377 }
 378
 379 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 380    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 381    token. */
 382 void
 383 lex_include (struct lexer *lexer, struct lex_reader *reader)
 384 {
 385   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 386   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 387 }
 388
 389 /* Appends READER to LEXER, so that it will be read after all other current
 390    readers have already been read. */
 391 void
 392 lex_append (struct lexer *lexer, struct lex_reader *reader)
 393 {
 394   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 395 }
 396 \f
 397 /* Advancing. */
 398
 399 /* Advances LEXER to the next token, consuming the current token. */
 400 void
 401 lex_get (struct lexer *lexer)
 402 {
 403   struct lex_source *src;
 404
 405   src = lex_source__ (lexer);
 406   if (src == NULL)
 407     return;
 408
 409   if (src->parse_ofs < src->n_parse)
 410     {
 411       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 412         lex_source_clear_parse (src);
 413       else
 414         src->parse_ofs++;
 415     }
 416
 417   while (src->parse_ofs == src->n_parse)
 418     if (!lex_source_get_parse (src))
 419       {
 420         ll_remove (&src->ll);
 421         lex_source_unref (src);
 422         src = lex_source__ (lexer);
 423         if (src == NULL)
 424           return;
 425       }
 426 }
 427
 428 /* Advances LEXER by N tokens. */
 429 void
 430 lex_get_n (struct lexer *lexer, size_t n)
 431 {
 432   while (n-- > 0)
 433     lex_get (lexer);
 434 }
 435 \f
 436 /* Issuing errors. */
 437
 438 /* Prints a syntax error message containing the current token and
 439    given message MESSAGE (if non-null). */
 440 void
 441 lex_error (struct lexer *lexer, const char *format, ...)
 442 {
 443   va_list args;
 444
 445   va_start (args, format);
 446   lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
 447                       format, args);
 448   va_end (args);
 449 }
 450
 451 /* Prints a syntax error message for the span of tokens N0 through N1,
 452    inclusive, from the current token in LEXER, adding message MESSAGE (if
 453    non-null). */
 454 void
 455 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 456 {
 457   va_list args;
 458
 459   va_start (args, format);
 460   int ofs = lex_ofs (lexer);
 461   lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
 462   va_end (args);
 463 }
 464
 465 /* Prints a syntax error message for the span of tokens with offsets OFS0
 466    through OFS1, inclusive, within the current command in LEXER, adding message
 467    MESSAGE (if non-null). */
 468 void
 469 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
 470 {
 471   va_list args;
 472
 473   va_start (args, format);
 474   lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
 475   va_end (args);
 476 }
 477
 478 /* Prints a message of the given CLASS containing the current token and given
 479    message MESSAGE (if non-null). */
 480 void
 481 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
 482 {
 483   va_list args;
 484
 485   va_start (args, format);
 486   lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
 487                       format, args);
 488   va_end (args);
 489 }
 490
 491 /* Prints a syntax error message for the span of tokens N0 through N1,
 492    inclusive, from the current token in LEXER, adding message MESSAGE (if
 493    non-null). */
 494 void
 495 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
 496               const char *format, ...)
 497 {
 498   va_list args;
 499
 500   va_start (args, format);
 501   int ofs = lex_ofs (lexer);
 502   lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
 503   va_end (args);
 504 }
 505
 506 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
 507    through OFS1, inclusive, within the current command in LEXER, adding message
 508    MESSAGE (if non-null). */
 509 void
 510 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
 511              const char *format, ...)
 512 {
 513   va_list args;
 514
 515   va_start (args, format);
 516   lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
 517   va_end (args);
 518 }
 519
 520 /* Prints a syntax error message saying that one of the strings provided as
 521    varargs, up to the first NULL, is expected. */
 522 void
 523 (lex_error_expecting) (struct lexer *lexer, ...)
 524 {
 525   va_list args;
 526
 527   va_start (args, lexer);
 528   lex_error_expecting_valist (lexer, args);
 529   va_end (args);
 530 }
 531
 532 /* Prints a syntax error message saying that one of the options provided in
 533    ARGS, up to the first NULL, is expected. */
 534 void
 535 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 536 {
 537   const char **options = NULL;
 538   size_t allocated = 0;
 539   size_t n = 0;
 540
 541   for (;;)
 542     {
 543       const char *option = va_arg (args, const char *);
 544       if (!option)
 545         break;
 546
 547       if (n >= allocated)
 548         options = x2nrealloc (options, &allocated, sizeof *options);
 549       options[n++] = option;
 550     }
 551   lex_error_expecting_array (lexer, options, n);
 552   free (options);
 553 }
 554
 555 void
 556 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 557 {
 558   switch (n)
 559     {
 560     case 0:
 561       lex_error (lexer, NULL);
 562       break;
 563
 564     case 1:
 565       lex_error (lexer, _("Syntax error expecting %s."), options[0]);
 566       break;
 567
 568     case 2:
 569       lex_error (lexer, _("Syntax error expecting %s or %s."),
 570                  options[0], options[1]);
 571       break;
 572
 573     case 3:
 574       lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
 575                  options[0], options[1], options[2]);
 576       break;
 577
 578     case 4:
 579       lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
 580                  options[0], options[1], options[2], options[3]);
 581       break;
 582
 583     case 5:
 584       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
 585                  options[0], options[1], options[2], options[3], options[4]);
 586       break;
 587
 588     case 6:
 589       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
 590                  options[0], options[1], options[2], options[3], options[4],
 591                  options[5]);
 592       break;
 593
 594     case 7:
 595       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
 596                           "or %s."),
 597                  options[0], options[1], options[2], options[3], options[4],
 598                  options[5], options[6]);
 599       break;
 600
 601     case 8:
 602       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
 603                           "or %s."),
 604                  options[0], options[1], options[2], options[3], options[4],
 605                  options[5], options[6], options[7]);
 606       break;
 607
 608     default:
 609       {
 610         struct string s = DS_EMPTY_INITIALIZER;
 611         for (size_t i = 0; i < n; i++)
 612           {
 613             if (i > 0)
 614               ds_put_cstr (&s, ", ");
 615             ds_put_cstr (&s, options[i]);
 616           }
 617         lex_error (lexer, _("Syntax error expecting one of the following: %s."),
 618                    ds_cstr (&s));
 619         ds_destroy (&s);
 620       }
 621       break;
 622     }
 623 }
 624
 625 /* Reports an error to the effect that subcommand SBC may only be specified
 626    once. */
 627 void
 628 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
 629 {
 630   int ofs = lex_ofs (lexer) - 1;
 631   if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
 632     ofs--;
 633
 634   /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
 635      BY. */
 636   if (lex_ofs_at_phrase__ (lexer, ofs, sbc, NULL))
 637     lex_ofs_error (lexer, ofs, ofs,
 638                    _("Subcommand %s may only be specified once."), sbc);
 639   else
 640     msg (SE, _("Subcommand %s may only be specified once."), sbc);
 641 }
 642
 643 /* Reports an error to the effect that subcommand SBC is missing.
 644
 645    This function does not take a lexer as an argument or use lex_error(),
 646    because a missing subcommand can normally be detected only after the whole
 647    command has been parsed, and so lex_error() would always report "Syntax
 648    error at end of command", which does not help the user find the error. */
 649 void
 650 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 651 {
 652   lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
 653                  _("Required subcommand %s was not specified."), sbc);
 654 }
 655
 656 /* Reports an error to the effect that specification SPEC may only be specified
 657    once within subcommand SBC. */
 658 void
 659 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 660 {
 661   lex_error (lexer, _("%s may only be specified once within subcommand %s."),
 662              spec, sbc);
 663 }
 664
 665 /* Reports an error to the effect that specification SPEC is missing within
 666    subcommand SBC. */
 667 void
 668 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 669 {
 670   lex_error (lexer, _("Required %s specification missing from %s subcommand."),
 671              spec, sbc);
 672 }
 673
 674 /* Prints a syntax error message for the span of tokens with offsets OFS0
 675    through OFS1, inclusive, within the current command in LEXER, adding message
 676    MESSAGE (if non-null) with the given ARGS. */
 677 void
 678 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
 679                     int ofs0, int ofs1, const char *format, va_list args)
 680 {
 681   lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
 682 }
 683
 684 /* Checks that we're at end of command.
 685    If so, returns a successful command completion code.
 686    If not, flags a syntax error and returns an error command
 687    completion code. */
 688 int
 689 lex_end_of_command (struct lexer *lexer)
 690 {
 691   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 692     {
 693       lex_error (lexer, _("Syntax error expecting end of command."));
 694       return CMD_FAILURE;
 695     }
 696   else
 697     return CMD_SUCCESS;
 698 }
 699 \f
 700 /* Token testing functions. */
 701
 702 /* Returns true if the current token is a number. */
 703 bool
 704 lex_is_number (const struct lexer *lexer)
 705 {
 706   return lex_next_is_number (lexer, 0);
 707 }
 708
 709 /* Returns true if the current token is a string. */
 710 bool
 711 lex_is_string (const struct lexer *lexer)
 712 {
 713   return lex_next_is_string (lexer, 0);
 714 }
 715
 716 /* Returns the value of the current token, which must be a
 717    floating point number. */
 718 double
 719 lex_number (const struct lexer *lexer)
 720 {
 721   return lex_next_number (lexer, 0);
 722 }
 723
 724 /* Returns true iff the current token is an integer. */
 725 bool
 726 lex_is_integer (const struct lexer *lexer)
 727 {
 728   return lex_next_is_integer (lexer, 0);
 729 }
 730
 731 /* Returns the value of the current token, which must be an
 732    integer. */
 733 long
 734 lex_integer (const struct lexer *lexer)
 735 {
 736   return lex_next_integer (lexer, 0);
 737 }
 738 \f
 739 /* Token testing functions with lookahead.
 740
 741    A value of 0 for N as an argument to any of these functions refers to the
 742    current token.  Lookahead is limited to the current command.  Any N greater
 743    than the number of tokens remaining in the current command will be treated
 744    as referring to a T_ENDCMD token. */
 745
 746 /* Returns true if the token N ahead of the current token is a number. */
 747 bool
 748 lex_next_is_number (const struct lexer *lexer, int n)
 749 {
 750   return token_is_number (lex_next (lexer, n));
 751 }
 752
 753 /* Returns true if the token N ahead of the current token is a string. */
 754 bool
 755 lex_next_is_string (const struct lexer *lexer, int n)
 756 {
 757   return token_is_string (lex_next (lexer, n));
 758 }
 759
 760 /* Returns the value of the token N ahead of the current token, which must be a
 761    floating point number. */
 762 double
 763 lex_next_number (const struct lexer *lexer, int n)
 764 {
 765   return token_number (lex_next (lexer, n));
 766 }
 767
 768 /* Returns true if the token N ahead of the current token is an integer. */
 769 bool
 770 lex_next_is_integer (const struct lexer *lexer, int n)
 771 {
 772   return token_is_integer (lex_next (lexer, n));
 773 }
 774
 775 /* Returns the value of the token N ahead of the current token, which must be
 776    an integer. */
 777 long
 778 lex_next_integer (const struct lexer *lexer, int n)
 779 {
 780   return token_integer (lex_next (lexer, n));
 781 }
 782 \f
 783 /* Token matching functions. */
 784
 785 /* If the current token has the specified TYPE, skips it and returns true.
 786    Otherwise, returns false. */
 787 bool
 788 lex_match (struct lexer *lexer, enum token_type type)
 789 {
 790   if (lex_token (lexer) == type)
 791     {
 792       lex_get (lexer);
 793       return true;
 794     }
 795   else
 796     return false;
 797 }
 798
 799 /* If the current token matches IDENTIFIER, skips it and returns true.
 800    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 801    returns false.
 802
 803    IDENTIFIER must be an ASCII string. */
 804 bool
 805 lex_match_id (struct lexer *lexer, const char *identifier)
 806 {
 807   return lex_match_id_n (lexer, identifier, 3);
 808 }
 809
 810 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 811    may be abbreviated to its first N letters.  Otherwise, returns false.
 812
 813    IDENTIFIER must be an ASCII string. */
 814 bool
 815 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 816 {
 817   if (lex_token (lexer) == T_ID
 818       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 819     {
 820       lex_get (lexer);
 821       return true;
 822     }
 823   else
 824     return false;
 825 }
 826
 827 /* If the current token is integer X, skips it and returns true.  Otherwise,
 828    returns false. */
 829 bool
 830 lex_match_int (struct lexer *lexer, int x)
 831 {
 832   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 833     {
 834       lex_get (lexer);
 835       return true;
 836     }
 837   else
 838     return false;
 839 }
 840 \f
 841 /* Forced matches. */
 842
 843 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 844    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 845    false.
 846
 847    IDENTIFIER must be an ASCII string. */
 848 bool
 849 lex_force_match_id (struct lexer *lexer, const char *identifier)
 850 {
 851   if (lex_match_id (lexer, identifier))
 852     return true;
 853   else
 854     {
 855       lex_error_expecting (lexer, identifier);
 856       return false;
 857     }
 858 }
 859
 860 /* If the current token has the specified TYPE, skips it and returns true.
 861    Otherwise, reports an error and returns false. */
 862 bool
 863 lex_force_match (struct lexer *lexer, enum token_type type)
 864 {
 865   if (lex_token (lexer) == type)
 866     {
 867       lex_get (lexer);
 868       return true;
 869     }
 870   else
 871     {
 872       const char *type_string = token_type_to_string (type);
 873       if (type_string)
 874         {
 875           char *s = xasprintf ("`%s'", type_string);
 876           lex_error_expecting (lexer, s);
 877           free (s);
 878         }
 879       else
 880         lex_error_expecting (lexer, token_type_to_name (type));
 881
 882       return false;
 883     }
 884 }
 885
 886 /* If the current token is a string, does nothing and returns true.
 887    Otherwise, reports an error and returns false. */
 888 bool
 889 lex_force_string (struct lexer *lexer)
 890 {
 891   if (lex_is_string (lexer))
 892     return true;
 893   else
 894     {
 895       lex_error (lexer, _("Syntax error expecting string."));
 896       return false;
 897     }
 898 }
 899
 900 /* If the current token is a string or an identifier, does nothing and returns
 901    true.  Otherwise, reports an error and returns false.
 902
 903    This is meant for use in syntactic situations where we want to encourage the
 904    user to supply a quoted string, but for compatibility we also accept
 905    identifiers.  (One example of such a situation is file names.)  Therefore,
 906    the error message issued when the current token is wrong only says that a
 907    string is expected and doesn't mention that an identifier would also be
 908    accepted. */
 909 bool
 910 lex_force_string_or_id (struct lexer *lexer)
 911 {
 912   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 913 }
 914
 915 /* If the current token is an integer, does nothing and returns true.
 916    Otherwise, reports an error and returns false. */
 917 bool
 918 lex_force_int (struct lexer *lexer)
 919 {
 920   if (lex_is_integer (lexer))
 921     return true;
 922   else
 923     {
 924       lex_error (lexer, _("Syntax error expecting integer."));
 925       return false;
 926     }
 927 }
 928
 929 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 930    nothing and returns true.  Otherwise, reports an error and returns false.
 931    If NAME is nonnull, then it is used in the error message. */
 932 bool
 933 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 934 {
 935   bool is_number = lex_is_number (lexer);
 936   bool is_integer = lex_is_integer (lexer);
 937   bool too_small = (is_integer ? lex_integer (lexer) < min
 938                     : is_number ? lex_number (lexer) < min
 939                     : false);
 940   bool too_big = (is_integer ? lex_integer (lexer) > max
 941                   : is_number ? lex_number (lexer) > max
 942                   : false);
 943   if (is_integer && !too_small && !too_big)
 944     return true;
 945
 946   if (min > max)
 947     {
 948       /* Weird, maybe a bug in the caller.  Just report that we needed an
 949          integer. */
 950       if (name)
 951         lex_error (lexer, _("Syntax error expecting integer for %s."), name);
 952       else
 953         lex_error (lexer, _("Syntax error expecting integer."));
 954     }
 955   else if (min == max)
 956     {
 957       if (name)
 958         lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
 959       else
 960         lex_error (lexer, _("Syntax error expecting %ld."), min);
 961     }
 962   else if (min + 1 == max)
 963     {
 964       if (name)
 965         lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
 966                    min, min + 1, name);
 967       else
 968         lex_error (lexer, _("Syntax error expecting %ld or %ld."),
 969                    min, min + 1);
 970     }
 971   else
 972     {
 973       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 974       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 975
 976       if (report_lower_bound && report_upper_bound)
 977         {
 978           if (name)
 979             lex_error (lexer,
 980                        _("Syntax error expecting integer "
 981                          "between %ld and %ld for %s."),
 982                        min, max, name);
 983           else
 984             lex_error (lexer, _("Syntax error expecting integer "
 985                                 "between %ld and %ld."),
 986                        min, max);
 987         }
 988       else if (report_lower_bound)
 989         {
 990           if (min == 0)
 991             {
 992               if (name)
 993                 lex_error (lexer, _("Syntax error expecting "
 994                                     "non-negative integer for %s."),
 995                            name);
 996               else
 997                 lex_error (lexer, _("Syntax error expecting "
 998                                     "non-negative integer."));
 999             }
1000           else if (min == 1)
1001             {
1002               if (name)
1003                 lex_error (lexer, _("Syntax error expecting "
1004                                     "positive integer for %s."),
1005                            name);
1006               else
1007                 lex_error (lexer, _("Syntax error expecting "
1008                                     "positive integer."));
1009             }
1010           else
1011             {
1012               if (name)
1013                 lex_error (lexer, _("Syntax error expecting "
1014                                     "integer %ld or greater for %s."),
1015                            min, name);
1016               else
1017                 lex_error (lexer, _("Syntax error expecting "
1018                                     "integer %ld or greater."), min);
1019             }
1020         }
1021       else if (report_upper_bound)
1022         {
1023           if (name)
1024             lex_error (lexer,
1025                        _("Syntax error expecting integer less than or equal "
1026                          "to %ld for %s."),
1027                        max, name);
1028           else
1029             lex_error (lexer, _("Syntax error expecting integer less than or "
1030                                 "equal to %ld."),
1031                        max);
1032         }
1033       else
1034         {
1035           if (name)
1036             lex_error (lexer, _("Syntax error expecting integer for %s."),
1037                        name);
1038           else
1039             lex_error (lexer, _("Syntax error expecting integer."));
1040         }
1041     }
1042   return false;
1043 }
1044
/* Requires the current token in LEXER to be a number.  Returns true, leaving
   the token in place, if it is one.  Otherwise reports a syntax error and
   returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  if (!lex_is_number (lexer))
    {
      lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  return true;
}
1056
/* Requires the current token in LEXER to be a number in the closed range
   [MIN,MAX].  Returns true, leaving the token in place, if it is.  Otherwise
   reports an error and returns false.

   If NAME is nonnull, the error message says what the number is for. */
bool
lex_force_num_range_closed (struct lexer *lexer, const char *name,
                            double min, double max)
{
  bool is_number = lex_is_number (lexer);
  bool too_small = false;
  bool too_big = false;
  if (is_number)
    {
      double value = lex_number (lexer);
      too_small = value < min;
      too_big = value > max;
    }

  if (is_number && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* An empty range is probably a bug in the caller.  Just say that a
         number was needed. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  if (min == max)
    {
      /* Exactly one acceptable value. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number %g for %s."),
                   min, name);
      else
        lex_error (lexer, _("Syntax error expecting number %g."), min);
      return false;
    }

  /* A bound at the extreme of the double range is treated as "no bound" and
     left out of the message, unless the token actually violated it. */
  bool show_min = too_small || min > -DBL_MAX;
  bool show_max = too_big || max < DBL_MAX;

  if (show_min)
    {
      if (show_max)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "between %g and %g for %s."),
                       min, max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "between %g and %g."),
                       min, max);
        }
      else if (min == 0)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "non-negative number for %s."),
                       name);
          else
            lex_error (lexer,
                       _("Syntax error expecting non-negative number."));
        }
      else
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "%g or greater for %s."),
                       min, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number %g or greater."),
                       min);
        }
    }
  else if (show_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "less than or equal to %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "less than or equal to %g."),
                   max);
    }
  else
    {
      /* Neither bound is worth mentioning. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }

  return false;
}
1149
/* Requires the current token in LEXER to be a number in the half-open range
   [MIN,MAX), that is, at least MIN and strictly less than MAX.  Returns true,
   leaving the token in place, if it is.  Otherwise reports an error and
   returns false.

   If NAME is nonnull, the error message says what the number is for. */
bool
lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
                              double min, double max)
{
  bool is_number = lex_is_number (lexer);
  bool too_small = false;
  bool too_big = false;
  if (is_number)
    {
      double value = lex_number (lexer);
      too_small = value < min;
      too_big = value >= max;
    }

  if (is_number && !too_small && !too_big)
    return true;

  if (min >= max)
    {
      /* An empty range is probably a bug in the caller.  Just say that a
         number was needed. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  /* A bound at the extreme of the double range is treated as "no bound" and
     left out of the message, unless the token actually violated it. */
  bool show_min = too_small || min > -DBL_MAX;
  bool show_max = too_big || max < DBL_MAX;

  if (show_min)
    {
      if (show_max)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "in [%g,%g) for %s."),
                       min, max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number in [%g,%g)."),
                       min, max);
        }
      else if (min == 0)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "non-negative number for %s."),
                       name);
          else
            lex_error (lexer,
                       _("Syntax error expecting non-negative number."));
        }
      else
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "number %g or greater for %s."),
                       min, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number %g or greater."),
                       min);
        }
    }
  else if (show_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer, _("Syntax error expecting number less than %g."),
                   max);
    }
  else
    {
      /* Neither bound is worth mentioning. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }

  return false;
}
1230
/* Requires the current token in LEXER to be a number in the open range
   (MIN,MAX), that is, strictly between MIN and MAX.  Returns true, leaving
   the token in place, if it is.  Otherwise reports an error and returns
   false.

   If NAME is nonnull, the error message says what the number is for. */
bool
lex_force_num_range_open (struct lexer *lexer, const char *name,
                          double min, double max)
{
  bool is_number = lex_is_number (lexer);
  bool too_small = false;
  bool too_big = false;
  if (is_number)
    {
      double value = lex_number (lexer);
      too_small = value <= min;
      too_big = value >= max;
    }

  if (is_number && !too_small && !too_big)
    return true;

  if (min >= max)
    {
      /* An empty range is probably a bug in the caller.  Just say that a
         number was needed. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  /* A bound at the extreme of the double range is treated as "no bound" and
     left out of the message, unless the token actually violated it. */
  bool show_min = too_small || min > -DBL_MAX;
  bool show_max = too_big || max < DBL_MAX;

  if (show_min)
    {
      if (show_max)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "in (%g,%g) for %s."),
                       min, max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number in (%g,%g)."),
                       min, max);
        }
      else if (min == 0)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting positive number for %s."),
                       name);
          else
            lex_error (lexer, _("Syntax error expecting positive number."));
        }
      else
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "greater than %g for %s."),
                       min, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number greater than %g."),
                       min);
        }
    }
  else if (show_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer, _("Syntax error expecting number less than %g."),
                   max);
    }
  else
    {
      /* Neither bound is worth mentioning. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }

  return false;
}
1311
1312 /* If the current token is an identifier, does nothing and returns true.
1313    Otherwise, reports an error and returns false. */
1314 bool
1315 lex_force_id (struct lexer *lexer)
1316 {
1317   if (lex_token (lexer) == T_ID)
1318     return true;
1319
1320   lex_error (lexer, _("Syntax error expecting identifier."));
1321   return false;
1322 }
1323 \f
1324 /* Token accessors. */
1325
1326 /* Returns the type of LEXER's current token. */
1327 enum token_type
1328 lex_token (const struct lexer *lexer)
1329 {
1330   return lex_next_token (lexer, 0);
1331 }
1332
/* Returns the numeric value of the current token in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens carry a meaningful value.  For any
   other token type this function always returns zero. */
double
lex_tokval (const struct lexer *lexer)
{
  return token_number (lex_next (lexer, 0));
}
1342
1343 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1344
1345    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1346    this functions this function will always return NULL.
1347
1348    The UTF-8 encoding of the returned string is correct for variable names and
1349    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1350    data_in() to use it in a "union value".  */
1351 const char *
1352 lex_tokcstr (const struct lexer *lexer)
1353 {
1354   return lex_next_tokcstr (lexer, 0);
1355 }
1356
1357 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1358    null-terminated (but the null terminator is not included in the returned
1359    substring's 'length').
1360
1361    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1362    this functions this function will always return NULL.
1363
1364    The UTF-8 encoding of the returned string is correct for variable names and
1365    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1366    data_in() to use it in a "union value".  */
1367 struct substring
1368 lex_tokss (const struct lexer *lexer)
1369 {
1370   return lex_next_tokss (lexer, 0);
1371 }
1372 \f
1373 /* Looking ahead.
1374
1375    A value of 0 for N as an argument to any of these functions refers to the
1376    current token.  Lookahead is limited to the current command.  Any N greater
1377    than the number of tokens remaining in the current command will be treated
1378    as referring to a T_ENDCMD token. */
1379
1380 static const struct lex_token *
1381 lex_next__ (const struct lexer *lexer_, int n)
1382 {
1383   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1384   struct lex_source *src = lex_source__ (lexer);
1385
1386   if (src != NULL)
1387     return lex_source_next__ (src, n);
1388   else
1389     {
1390       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1391       return &stop_token;
1392     }
1393 }
1394
1395 static const struct lex_token *
1396 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1397 {
1398   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1399
1400   if (ofs < 0)
1401     {
1402       static const struct lex_token endcmd_token
1403         = { .token = { .type = T_ENDCMD } };
1404       return &endcmd_token;
1405     }
1406
1407   while (ofs >= src->n_parse)
1408     {
1409       if (src->n_parse > 0)
1410         {
1411           const struct lex_token *t = src->parse[src->n_parse - 1];
1412           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1413             return t;
1414         }
1415
1416       lex_source_get_parse (src);
1417     }
1418
1419   return src->parse[ofs];
1420 }
1421
1422 static const struct lex_token *
1423 lex_source_next__ (const struct lex_source *src, int n)
1424 {
1425   return lex_source_ofs__ (src, n + src->parse_ofs);
1426 }
1427
1428 /* Returns the "struct token" of the token N after the current one in LEXER.
1429    The returned pointer can be invalidated by pretty much any succeeding call
1430    into the lexer, although the string pointer within the returned token is
1431    only invalidated by consuming the token (e.g. with lex_get()). */
1432 const struct token *
1433 lex_next (const struct lexer *lexer, int n)
1434 {
1435   return &lex_next__ (lexer, n)->token;
1436 }
1437
1438 /* Returns the type of the token N after the current one in LEXER. */
1439 enum token_type
1440 lex_next_token (const struct lexer *lexer, int n)
1441 {
1442   return lex_next (lexer, n)->type;
1443 }
1444
/* Returns the numeric value of the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens carry a meaningful value.  For any
   other token type this function always returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1454
1455 /* Returns the null-terminated string in the token N after the current one, in
1456    UTF-8 encoding.
1457
1458    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1459    this functions this function will always return NULL.
1460
1461    The UTF-8 encoding of the returned string is correct for variable names and
1462    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1463    data_in() to use it in a "union value".  */
1464 const char *
1465 lex_next_tokcstr (const struct lexer *lexer, int n)
1466 {
1467   return lex_next_tokss (lexer, n).string;
1468 }
1469
1470 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1471    The string is null-terminated (but the null terminator is not included in
1472    the returned substring's 'length').
1473
1474    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1475    tokens this functions this function will always return NULL.
1476
1477    The UTF-8 encoding of the returned string is correct for variable names and
1478    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1479    data_in() to use it in a "union value".  */
1480 struct substring
1481 lex_next_tokss (const struct lexer *lexer, int n)
1482 {
1483   return lex_next (lexer, n)->string;
1484 }
1485
1486 /* Returns the offset of the current token within the command being parsed in
1487    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1488    on.  The return value is useful later for referring to this token in calls
1489    to lex_ofs_*(). */
1490 int
1491 lex_ofs (const struct lexer *lexer)
1492 {
1493   struct lex_source *src = lex_source__ (lexer);
1494   return src ? src->parse_ofs : 0;
1495 }
1496
1497 /* Returns the offset of the last token in the current command. */
1498 int
1499 lex_max_ofs (const struct lexer *lexer)
1500 {
1501   struct lex_source *src = lex_source__ (lexer);
1502   if (!src)
1503     return 0;
1504
1505   int ofs = MAX (1, src->n_parse) - 1;
1506   for (;;)
1507     {
1508       enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1509       if (type == T_ENDCMD || type == T_STOP)
1510         return ofs;
1511
1512       ofs++;
1513     }
1514 }
1515
1516 /* Returns the token within LEXER's current command with offset OFS.  Use
1517    lex_ofs() to find out the offset of the current token. */
1518 const struct token *
1519 lex_ofs_token (const struct lexer *lexer_, int ofs)
1520 {
1521   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1522   struct lex_source *src = lex_source__ (lexer);
1523
1524   if (src != NULL)
1525     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1526   else
1527     {
1528       static const struct token stop_token = { .type = T_STOP };
1529       return &stop_token;
1530     }
1531 }
1532
/* Allocates and returns a new struct msg_location covering the tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in LEXER.
   See lex_ofs() for an explanation of token offsets.

   The caller owns the returned object and must eventually free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes positions relative to the current token, so
     rebase the absolute offsets onto it. */
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1544
1545 /* Returns a msg_point for the first character in the token with offset OFS,
1546    where offset 0 is the first token in the command currently being parsed, 1
1547    the second token, and so on.  These are absolute offsets, not relative to
1548    the token currently being parsed within the command.
1549
1550    Returns zeros for a T_STOP token.
1551  */
1552 struct msg_point
1553 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1554 {
1555   const struct lex_source *src = lex_source__ (lexer);
1556   return (src
1557           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1558           : (struct msg_point) { 0, 0 });
1559 }
1560
1561 /* Returns a msg_point for the last character, inclusive, in the token with
1562    offset OFS, where offset 0 is the first token in the command currently being
1563    parsed, 1 the second token, and so on.  These are absolute offsets, not
1564    relative to the token currently being parsed within the command.
1565
1566    Returns zeros for a T_STOP token.
1567
1568    Most of the time, a single token is wholly within a single line of syntax,
1569    so that the start and end point for a given offset have the same line
1570    number.  There are two exceptions: a T_STRING token can be made up of
1571    multiple segments on adjacent lines connected with "+" punctuators, and a
1572    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1573    the next.
1574  */
1575 struct msg_point
1576 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1577 {
1578   const struct lex_source *src = lex_source__ (lexer);
1579   return (src
1580           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1581           : (struct msg_point) { 0, 0 });
1582 }
1583
1584 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1585    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1586    are both zero, this requests the syntax for the current token.)
1587
1588    The caller must eventually free the returned string (with free()).  The
1589    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1590    that, for example, it may include comments, spaces, and new-lines if it
1591    spans multiple tokens.  Macro expansion, however, has already been
1592    performed. */
1593 char *
1594 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1595 {
1596   const struct lex_source *src = lex_source__ (lexer);
1597   return (src
1598           ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1599           : xstrdup (""));
1600 }
1601
1602
/* Returns the syntax text for the tokens with offsets OFS0 to OFS1,
   inclusive.  (With OFS0 and OFS1 both zero, for example, this is the syntax
   of the first token in the current command.)  Returns an empty string if
   LEXER has no source.

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and is in the original form supplied to the
   lexer, so if it spans multiple tokens it may include comments, spaces, and
   new-lines.  Macro expansion, however, has already been performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1618
1619 /* Returns true if the token N ahead of the current one was produced by macro
1620    expansion, false otherwise. */
1621 bool
1622 lex_next_is_from_macro (const struct lexer *lexer, int n)
1623 {
1624   return lex_next__ (lexer, n)->macro_rep != NULL;
1625 }
1626
1627 static bool
1628 lex_tokens_match (const struct token *actual, const struct token *expected)
1629 {
1630   if (actual->type != expected->type)
1631     return false;
1632
1633   switch (actual->type)
1634     {
1635     case T_POS_NUM:
1636     case T_NEG_NUM:
1637       return actual->number == expected->number;
1638
1639     case T_ID:
1640       return lex_id_match (expected->string, actual->string);
1641
1642     case T_STRING:
1643       return (actual->string.length == expected->string.length
1644               && !memcmp (actual->string.string, expected->string.string,
1645                           actual->string.length));
1646
1647     default:
1648       return true;
1649     }
1650 }
1651
1652 static bool
1653 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s,
1654                      size_t *n_matchedp)
1655 {
1656   struct string_lexer slex;
1657   struct token token;
1658
1659   size_t n_matched = 0;
1660   bool all_matched = true;
1661   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1662   while (string_lexer_next (&slex, &token))
1663     {
1664       bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + n_matched),
1665                                      &token);
1666       token_uninit (&token);
1667       if (!match)
1668         {
1669           all_matched = false;
1670           break;
1671         }
1672       n_matched++;
1673     }
1674   if (n_matchedp)
1675     *n_matchedp = n_matched;
1676   return all_matched;
1677 }
1678
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  int start = lex_ofs (lexer);
  return lex_ofs_at_phrase__ (lexer, start, s, NULL);
}
1690
/* Returns true and skips the matched tokens if LEXER is positioned at the
   sequence of tokens that may be parsed from S.  Otherwise returns false and
   consumes nothing.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens;
  bool found = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_tokens);
  if (found)
    lex_get_n (lexer, n_tokens);
  return found;
}
1706
/* Returns true and skips the matched tokens if LEXER is positioned at the
   sequence of tokens that may be parsed from S.  Otherwise issues an error
   and returns false without consuming anything.  The error is issued for the
   span from the current token through the token at which matching stopped.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_force_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched;
  if (!lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched))
    {
      lex_next_error (lexer, 0, n_matched,
                      _("Syntax error expecting `%s'."), s);
      return false;
    }

  lex_get_n (lexer, n_matched);
  return true;
}
1724
1725 /* Returns the 1-based line number of the source text at the byte OFFSET in
1726    SRC. */
1727 static int
1728 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1729 {
1730   size_t lo = 0;
1731   size_t hi = src->n_lines;
1732   for (;;)
1733     {
1734       size_t mid = (lo + hi) / 2;
1735       if (mid + 1 >= src->n_lines)
1736         return src->n_lines;
1737       else if (offset >= src->lines[mid + 1])
1738         lo = mid;
1739       else if (offset < src->lines[mid])
1740         hi = mid;
1741       else
1742         return mid + 1;
1743     }
1744 }
1745
1746 /* Returns the 1-based column number of the source text at the byte OFFSET in
1747    SRC. */
1748 static int
1749 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1750 {
1751   const char *newline = memrchr (src->buffer, '\n', offset);
1752   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1753   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1754 }
1755
1756 static struct msg_point
1757 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1758 {
1759   return (struct msg_point) {
1760     .line = lex_source_ofs_to_line_number (src, offset),
1761     .column = lex_source_ofs_to_column_number (src, offset),
1762   };
1763 }
1764
1765 static struct msg_point
1766 lex_token_start_point (const struct lex_source *src,
1767                        const struct lex_token *token)
1768 {
1769   return lex_source_ofs_to_point__ (src, token->token_pos);
1770 }
1771
1772 static struct msg_point
1773 lex_token_end_point (const struct lex_source *src,
1774                      const struct lex_token *token)
1775 {
1776   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1777 }
1778
1779 static struct msg_location
1780 lex_token_location (const struct lex_source *src,
1781                     const struct lex_token *t0,
1782                     const struct lex_token *t1)
1783 {
1784   return (struct msg_location) {
1785     .file_name = intern_new_if_nonnull (src->reader->file_name),
1786     .start = lex_token_start_point (src, t0),
1787     .end = lex_token_end_point (src, t1),
1788     .src = CONST_CAST (struct lex_source *, src),
1789   };
1790 }
1791
1792 static struct msg_location *
1793 lex_token_location_rw (const struct lex_source *src,
1794                        const struct lex_token *t0,
1795                        const struct lex_token *t1)
1796 {
1797   struct msg_location location = lex_token_location (src, t0, t1);
1798   return msg_location_dup (&location);
1799 }
1800
/* Returns a msg_location, obtained from lex_token_location_rw(), that covers
   the tokens in SRC with offsets OFS0 through OFS1. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1808
1809 /* Returns the name of the syntax file from which the current command is drawn.
1810    Returns NULL for a T_STOP token or if the command's source does not have
1811    line numbers.
1812
1813    There is no version of this function that takes an N argument because
1814    lookahead only works to the end of a command and any given command is always
1815    within a single syntax file. */
1816 const char *
1817 lex_get_file_name (const struct lexer *lexer)
1818 {
1819   struct lex_source *src = lex_source__ (lexer);
1820   return src == NULL ? NULL : src->reader->file_name;
1821 }
1822
1823 /* Returns a newly allocated msg_location for the syntax that represents tokens
1824    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1825    must eventually free the location (with msg_location_destroy()). */
1826 struct msg_location *
1827 lex_get_location (const struct lexer *lexer, int n0, int n1)
1828 {
1829   struct msg_location *loc = xmalloc (sizeof *loc);
1830   *loc = (struct msg_location) {
1831     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1832     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1833     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1834     .src = lex_source__ (lexer),
1835   };
1836   lex_source_ref (loc->src);
1837   return loc;
1838 }
1839
1840 const char *
1841 lex_get_encoding (const struct lexer *lexer)
1842 {
1843   struct lex_source *src = lex_source__ (lexer);
1844   return src == NULL ? NULL : src->reader->encoding;
1845 }
1846
1847 /* Returns the syntax mode for the syntax file from which the current drawn is
1848    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1849    does not have line numbers.
1850
1851    There is no version of this function that takes an N argument because
1852    lookahead only works to the end of a command and any given command is always
1853    within a single syntax file. */
1854 enum segmenter_mode
1855 lex_get_syntax_mode (const struct lexer *lexer)
1856 {
1857   struct lex_source *src = lex_source__ (lexer);
1858   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1859 }
1860
1861 /* Returns the error mode for the syntax file from which the current drawn is
1862    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1863    source does not have line numbers.
1864
1865    There is no version of this function that takes an N argument because
1866    lookahead only works to the end of a command and any given command is always
1867    within a single syntax file. */
1868 enum lex_error_mode
1869 lex_get_error_mode (const struct lexer *lexer)
1870 {
1871   struct lex_source *src = lex_source__ (lexer);
1872   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1873 }
1874
1875 /* If the source that LEXER is currently reading has error mode
1876    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1877    token to be read comes directly from whatever is next read from the stream.
1878
1879    It makes sense to call this function after encountering an error in a
1880    command entered on the console, because usually the user would prefer not to
1881    have cascading errors. */
1882 void
1883 lex_interactive_reset (struct lexer *lexer)
1884 {
1885   struct lex_source *src = lex_source__ (lexer);
1886   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1887     {
1888       src->length = 0;
1889       src->journal_pos = src->seg_pos = 0;
1890       src->n_lines = 0;
1891       src->suppress_next_newline = false;
1892       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1893                                        false);
1894       lex_stage_clear (&src->pp);
1895       lex_stage_clear (&src->merge);
1896       lex_source_clear_parse (src);
1897       lex_source_push_endcmd__ (src);
1898     }
1899 }
1900
1901 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1902 void
1903 lex_discard_rest_of_command (struct lexer *lexer)
1904 {
1905   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1906     lex_get (lexer);
1907 }
1908
1909 /* Discards all lookahead tokens in LEXER, then discards all input sources
1910    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1911    runs out of input sources. */
1912 void
1913 lex_discard_noninteractive (struct lexer *lexer)
1914 {
1915   struct lex_source *src = lex_source__ (lexer);
1916   if (src != NULL)
1917     {
1918       if (src->reader->error == LEX_ERROR_IGNORE)
1919         return;
1920
1921       lex_stage_clear (&src->pp);
1922       lex_stage_clear (&src->merge);
1923       lex_source_clear_parse (src);
1924
1925       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1926            src = lex_source__ (lexer))
1927         {
1928           ll_remove (&src->ll);
1929           lex_source_unref (src);
1930         }
1931     }
1932 }
1933 \f
1934 static void
1935 lex_source_expand__ (struct lex_source *src)
1936 {
1937   if (src->length >= src->allocated)
1938     src->buffer = x2realloc (src->buffer, &src->allocated);
1939 }
1940
1941 static void
1942 lex_source_read__ (struct lex_source *src)
1943 {
1944   do
1945     {
1946       lex_source_expand__ (src);
1947
1948       size_t space = src->allocated - src->length;
1949       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1950       size_t n = src->reader->class->read (src->reader,
1951                                            &src->buffer[src->length],
1952                                            space, prompt);
1953       assert (n <= space);
1954
1955       if (n == 0)
1956         {
1957           /* End of input. */
1958           src->reader->eof = true;
1959           return;
1960         }
1961
1962       src->length += n;
1963     }
1964   while (!memchr (&src->buffer[src->seg_pos], '\n',
1965                   src->length - src->seg_pos));
1966 }
1967
1968 static struct lex_source *
1969 lex_source__ (const struct lexer *lexer)
1970 {
1971   return (ll_is_empty (&lexer->sources) ? NULL
1972           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1973 }
1974
/* Public, read-only counterpart of lex_source__(): returns the source at the
   head of LEXER's list of sources, or NULL if there is none. */
const struct lex_source *
lex_source (const struct lexer *lexer)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src;
}
1980
1981 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1982    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
1983    both zero, this requests the syntax for the first token in the current
1984    command.)  The caller must eventually free the returned string (with
1985    free()).  The syntax is encoded in UTF-8 and in the original form supplied
1986    to the lexer so that, for example, it may include comments, spaces, and
1987    new-lines if it spans multiple tokens.  Macro expansion, however, has
1988    already been performed. */
1989 static char *
1990 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1991 {
1992   struct string s = DS_EMPTY_INITIALIZER;
1993   for (size_t i = ofs0; i <= ofs1; )
1994     {
1995       /* Find [I,J) as the longest sequence of tokens not produced by macro
1996          expansion, or otherwise the longest sequence expanded from a single
1997          macro call. */
1998       const struct lex_token *first = lex_source_ofs__ (src, i);
1999       size_t j;
2000       for (j = i + 1; j <= ofs1; j++)
2001         {
2002           const struct lex_token *cur = lex_source_ofs__ (src, j);
2003           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
2004               || first->macro_rep != cur->macro_rep)
2005             break;
2006         }
2007       const struct lex_token *last = lex_source_ofs__ (src, j - 1);
2008
2009       /* Now add the syntax for this sequence of tokens to SRC. */
2010       if (!ds_is_empty (&s))
2011         ds_put_byte (&s, ' ');
2012       if (!first->macro_rep)
2013         {
2014           size_t start = first->token_pos;
2015           size_t end = last->token_pos + last->token_len;
2016           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
2017         }
2018       else
2019         {
2020           size_t start = first->ofs;
2021           size_t end = last->ofs + last->len;
2022           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
2023                                            end - start));
2024         }
2025
2026       i = j;
2027     }
2028   return ds_steal_cstr (&s);
2029 }
2030
2031 static bool
2032 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
2033 {
2034   for (int i = ofs0; i <= ofs1; i++)
2035     if (lex_source_ofs__ (src, i)->macro_rep)
2036       return true;
2037   return false;
2038 }
2039
2040 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
2041    raw UTF-8 syntax for the macro call (not for the expansion) and for any
2042    other tokens included in that range.  The syntax is encoded in UTF-8 and in
2043    the original form supplied to the lexer so that, for example, it may include
2044    comments, spaces, and new-lines if it spans multiple tokens.
2045
2046    Returns an empty string if the token range doesn't include a macro call.
2047
2048    The caller must not modify or free the returned string. */
2049 static struct substring
2050 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2051 {
2052   if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2053     return ss_empty ();
2054
2055   const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2056   const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2057   size_t start = token0->token_pos;
2058   size_t end = token1->token_pos + token1->token_len;
2059
2060   return ss_buffer (&src->buffer[start], end - start);
2061 }
2062
2063 static void
2064 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2065                        int ofs0, int ofs1, const char *format, va_list args)
2066 {
2067   struct string s = DS_EMPTY_INITIALIZER;
2068
2069   if (src)
2070     {
2071       /* Get the macro call(s) that expanded to the syntax that caused the
2072          error. */
2073       char call[64];
2074       str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2075                      call, sizeof call);
2076       if (call[0])
2077         ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2078     }
2079   else
2080     ds_put_cstr (&s, _("At end of input"));
2081
2082   if (!ds_is_empty (&s))
2083     ds_put_cstr (&s, ": ");
2084   if (format)
2085     ds_put_vformat (&s, format, args);
2086   else
2087     ds_put_cstr (&s, _("Syntax error."));
2088
2089   if (ds_last (&s) != '.')
2090     ds_put_byte (&s, '.');
2091
2092   struct msg *m = xmalloc (sizeof *m);
2093   *m = (struct msg) {
2094     .category = msg_class_to_category (class),
2095     .severity = msg_class_to_severity (class),
2096     .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2097     .text = ds_steal_cstr (&s),
2098   };
2099   msg_emit (m);
2100 }
2101
2102 static void
2103 lex_get_error (struct lex_source *src, const struct lex_token *token)
2104 {
2105   struct msg *m = xmalloc (sizeof *m);
2106   *m = (struct msg) {
2107     .category = MSG_C_SYNTAX,
2108     .severity = MSG_S_ERROR,
2109     .location = lex_token_location_rw (src, token, token),
2110     .text = ss_xstrdup (token->token.string),
2111   };
2112
2113   struct lexer *lexer = src->lexer;
2114   if (lexer->n_messages >= lexer->allocated_messages)
2115     lexer->messages = x2nrealloc (lexer->messages, &lexer->allocated_messages,
2116                                   sizeof *lexer->messages);
2117   lexer->messages[lexer->n_messages++] = m;
2118 }
2119
2120 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2121    underlying lex_reader if necessary.  Returns true if a new token was added
2122    to SRC's deque, false otherwise.  The caller should retry failures unless
2123    SRC's 'eof' marker was set to true indicating that there will be no more
2124    tokens from this source. */
2125 static bool
2126 lex_source_try_get_pp (struct lex_source *src)
2127 {
2128   /* Append a new token to SRC and initialize it. */
2129   struct lex_token *token = xmalloc (sizeof *token);
2130   token->token = (struct token) { .type = T_STOP };
2131   token->macro_rep = NULL;
2132   token->ref_cnt = NULL;
2133   token->token_pos = src->seg_pos;
2134
2135   /* Extract a segment. */
2136   const char *segment;
2137   enum segment_type seg_type;
2138   int seg_len;
2139   for (;;)
2140     {
2141       segment = &src->buffer[src->seg_pos];
2142       seg_len = segmenter_push (&src->segmenter, segment,
2143                                 src->length - src->seg_pos,
2144                                 src->reader->eof, &seg_type);
2145       if (seg_len >= 0)
2146         break;
2147
2148       /* The segmenter needs more input to produce a segment. */
2149       assert (!src->reader->eof);
2150       lex_source_read__ (src);
2151     }
2152
2153   /* Update state based on the segment. */
2154   token->token_len = seg_len;
2155   src->seg_pos += seg_len;
2156   if (seg_type == SEG_NEWLINE)
2157     {
2158       if (src->n_lines >= src->allocated_lines)
2159         src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2160                                  sizeof *src->lines);
2161       src->lines[src->n_lines++] = src->seg_pos;
2162     }
2163
2164   /* Get a token from the segment. */
2165   enum tokenize_result result = token_from_segment (
2166     seg_type, ss_buffer (segment, seg_len), &token->token);
2167
2168   /* If we've reached the end of a line, or the end of a command, then pass
2169      the line to the output engine as a syntax text item.  */
2170   int n_lines = seg_type == SEG_NEWLINE;
2171   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2172     {
2173       n_lines++;
2174       src->suppress_next_newline = true;
2175     }
2176   else if (n_lines > 0 && src->suppress_next_newline)
2177     {
2178       n_lines--;
2179       src->suppress_next_newline = false;
2180     }
2181   for (int i = 0; i < n_lines; i++)
2182     {
2183       /* Beginning of line. */
2184       const char *line = &src->buffer[src->journal_pos];
2185
2186       /* Calculate line length, including \n or \r\n end-of-line if present.
2187
2188          We use src->length even though that may be beyond what we've actually
2189          converted to tokens.  That's because, if we're emitting the line due
2190          to SEG_END_COMMAND, we want to take the whole line through the
2191          newline, not just through the '.'. */
2192       size_t max_len = src->length - src->journal_pos;
2193       const char *newline = memchr (line, '\n', max_len);
2194       size_t line_len = newline ? newline - line + 1 : max_len;
2195
2196       /* Calculate line length excluding end-of-line. */
2197       size_t copy_len = line_len;
2198       if (copy_len > 0 && line[copy_len - 1] == '\n')
2199         copy_len--;
2200       if (copy_len > 0 && line[copy_len - 1] == '\r')
2201         copy_len--;
2202
2203       /* Submit the line as syntax. */
2204       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2205                                                    xmemdup0 (line, copy_len),
2206                                                    NULL));
2207
2208       src->journal_pos += line_len;
2209     }
2210
2211   switch (result)
2212     {
2213     case TOKENIZE_ERROR:
2214       lex_get_error (src, token);
2215       /* Fall through. */
2216     case TOKENIZE_EMPTY:
2217       lex_token_destroy (token);
2218       return false;
2219
2220     case TOKENIZE_TOKEN:
2221       if (token->token.type == T_STOP)
2222         {
2223           token->token.type = T_ENDCMD;
2224           src->eof = true;
2225         }
2226       lex_stage_push_last (&src->pp, token);
2227       return true;
2228     }
2229   NOT_REACHED ();
2230 }
2231
2232 /* Attempts to append a new token to SRC.  Returns true if successful, false on
2233    failure.  On failure, the end of SRC has been reached and no more tokens
2234    will be forthcoming from it.
2235
2236    Does not make the new token available for lookahead yet; the caller must
2237    adjust SRC's 'middle' pointer to do so. */
2238 static bool
2239 lex_source_get_pp (struct lex_source *src)
2240 {
2241   while (!src->eof)
2242     if (lex_source_try_get_pp (src))
2243       return true;
2244   return false;
2245 }
2246
2247 static bool
2248 lex_source_try_get_merge (const struct lex_source *src_)
2249 {
2250   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2251
2252   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2253     return false;
2254
2255   if (!settings_get_mexpand ())
2256     {
2257       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2258       return true;
2259     }
2260
2261   /* Now pass tokens one-by-one to the macro expander.
2262
2263      In the common case where there is no macro to expand, the loop is not
2264      entered.  */
2265   struct macro_call *mc;
2266   int n_call = macro_call_create (src->lexer->macros,
2267                                   &lex_stage_first (&src->pp)->token, &mc);
2268   for (int ofs = 1; !n_call; ofs++)
2269     {
2270       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2271         {
2272           /* This should not be reachable because we always get a T_ENDCMD at
2273              the end of an input file (transformed from T_STOP by
2274              lex_source_try_get_pp()) and the macro_expander should always
2275              terminate expansion on T_ENDCMD. */
2276           NOT_REACHED ();
2277         }
2278
2279       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2280       const struct macro_token mt = {
2281         .token = t->token,
2282         .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2283       };
2284       const struct msg_location loc = lex_token_location (src, t, t);
2285       n_call = macro_call_add (mc, &mt, &loc);
2286     }
2287   if (n_call < 0)
2288     {
2289       /* False alarm: no macro expansion after all.  Use first token as
2290          lookahead.  We'll retry macro expansion from the second token next
2291          time around. */
2292       macro_call_destroy (mc);
2293       lex_stage_shift (&src->merge, &src->pp, 1);
2294       return true;
2295     }
2296
2297   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2298      are a macro call.  (These are likely to be the only tokens in 'pp'.)
2299      Expand them.  */
2300   const struct lex_token *c0 = lex_stage_first (&src->pp);
2301   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2302   struct macro_tokens expansion = { .n = 0 };
2303   struct msg_location loc = lex_token_location (src, c0, c1);
2304   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2305   macro_call_destroy (mc);
2306
2307   /* Convert the macro expansion into syntax for possible error messages
2308      later. */
2309   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2310   size_t *len = xnmalloc (expansion.n, sizeof *len);
2311   struct string s = DS_EMPTY_INITIALIZER;
2312   macro_tokens_to_syntax (&expansion, &s, ofs, len);
2313
2314   if (settings_get_mprint ())
2315     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2316                                           _("Macro Expansion")));
2317
2318   /* Append the macro expansion tokens to the lookahead. */
2319   if (expansion.n > 0)
2320     {
2321       char *macro_rep = ds_steal_cstr (&s);
2322       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2323       *ref_cnt = expansion.n;
2324       for (size_t i = 0; i < expansion.n; i++)
2325         {
2326           struct lex_token *token = xmalloc (sizeof *token);
2327           *token = (struct lex_token) {
2328             .token = expansion.mts[i].token,
2329             .token_pos = c0->token_pos,
2330             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2331             .macro_rep = macro_rep,
2332             .ofs = ofs[i],
2333             .len = len[i],
2334             .ref_cnt = ref_cnt,
2335           };
2336           lex_stage_push_last (&src->merge, token);
2337
2338           ss_dealloc (&expansion.mts[i].syntax);
2339         }
2340     }
2341   else
2342     ds_destroy (&s);
2343   free (expansion.mts);
2344   free (ofs);
2345   free (len);
2346
2347   /* Destroy the tokens for the call. */
2348   for (size_t i = 0; i < n_call; i++)
2349     lex_stage_pop_first (&src->pp);
2350
2351   return expansion.n > 0;
2352 }
2353
2354 /* Attempts to obtain at least one new token into 'merge' in SRC.
2355
2356    Returns true if successful, false on failure.  In the latter case, SRC is
2357    exhausted and 'src->eof' is now true. */
2358 static bool
2359 lex_source_get_merge (struct lex_source *src)
2360 {
2361   while (!src->eof)
2362     if (lex_source_try_get_merge (src))
2363       return true;
2364   return false;
2365 }
2366
2367 static bool
2368 lex_source_get_parse__ (struct lex_source *src)
2369 {
2370   struct merger m = MERGER_INIT;
2371   struct token out;
2372   for (size_t i = 0; ; i++)
2373     {
2374       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2375         {
2376           /* We always get a T_ENDCMD at the end of an input file
2377              (transformed from T_STOP by lex_source_try_get_pp()) and
2378              merger_add() should never return -1 on T_ENDCMD. */
2379           assert (lex_stage_is_empty (&src->merge));
2380           return false;
2381         }
2382
2383       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2384                                &out);
2385       if (!retval)
2386         {
2387           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2388           return true;
2389         }
2390       else if (retval > 0)
2391         {
2392           /* Add a token that merges all the tokens together. */
2393           const struct lex_token *first = lex_stage_first (&src->merge);
2394           const struct lex_token *last = lex_stage_nth (&src->merge,
2395                                                         retval - 1);
2396           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2397           struct lex_token *t = xmalloc (sizeof *t);
2398           *t = (struct lex_token) {
2399             .token = out,
2400             .token_pos = first->token_pos,
2401             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2402
2403             /* This works well if all the tokens were not expanded from macros,
2404                or if they came from the same macro expansion.  It just gives up
2405                in the other (corner) cases. */
2406             .macro_rep = macro ? first->macro_rep : NULL,
2407             .ofs = macro ? first->ofs : 0,
2408             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2409             .ref_cnt = macro ? first->ref_cnt : NULL,
2410           };
2411           if (t->ref_cnt)
2412             ++*t->ref_cnt;
2413           lex_source_push_parse (src, t);
2414
2415           for (int i = 0; i < retval; i++)
2416             lex_stage_pop_first (&src->merge);
2417           return true;
2418         }
2419     }
2420 }
2421
2422 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2423
2424    Returns true if successful, false on failure.  In the latter case, SRC is
2425    exhausted and 'src->eof' is now true. */
2426 static bool
2427 lex_source_get_parse (struct lex_source *src)
2428 {
2429   bool ok = lex_source_get_parse__ (src);
2430   struct lexer *lexer = src->lexer;
2431   if (lexer->n_messages)
2432     {
2433       struct msg **messages = lexer->messages;
2434       size_t n = lexer->n_messages;
2435
2436       lexer->messages = NULL;
2437       lexer->n_messages = lexer->allocated_messages = 0;
2438
2439       for (size_t i = 0; i < n; i++)
2440         msg_emit (messages[i]);
2441       free (messages);
2442     }
2443   return ok;
2444 }
2445 \f
2446 static void
2447 lex_source_push_endcmd__ (struct lex_source *src)
2448 {
2449   assert (src->n_parse == 0);
2450
2451   struct lex_token *token = xmalloc (sizeof *token);
2452   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2453   lex_source_push_parse (src, token);
2454 }
2455
2456 static void
2457 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2458 {
2459   if (src->n_parse >= src->allocated_parse)
2460     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2461                              sizeof *src->parse);
2462   src->parse[src->n_parse++] = token;
2463 }
2464
2465 static void
2466 lex_source_clear_parse (struct lex_source *src)
2467 {
2468   for (size_t i = 0; i < src->n_parse; i++)
2469     lex_token_destroy (src->parse[i]);
2470   src->n_parse = src->parse_ofs = 0;
2471 }
2472
2473 static struct lex_source *
2474 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2475 {
2476   size_t allocated_lines = 4;
2477   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2478   *lines = 0;
2479
2480   struct lex_source *src = xmalloc (sizeof *src);
2481   *src = (struct lex_source) {
2482     .n_refs = 1,
2483     .reader = reader,
2484     .segmenter = segmenter_init (reader->syntax, false),
2485     .lexer = lexer,
2486     .lines = lines,
2487     .n_lines = 1,
2488     .allocated_lines = allocated_lines,
2489   };
2490
2491   lex_source_push_endcmd__ (src);
2492
2493   return src;
2494 }
2495
2496 void
2497 lex_set_message_handler (struct lexer *lexer,
2498                          void (*output_msg) (const struct msg *,
2499                                              struct lexer *))
2500 {
2501   struct msg_handler msg_handler = {
2502     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2503     .aux = lexer,
2504     .lex_source_ref = lex_source_ref,
2505     .lex_source_unref = lex_source_unref,
2506     .lex_source_get_line = lex_source_get_line,
2507   };
2508   msg_set_handler (&msg_handler);
2509 }
2510
2511 struct lex_source *
2512 lex_source_ref (const struct lex_source *src_)
2513 {
2514   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2515   if (src)
2516     {
2517       assert (src->n_refs > 0);
2518       src->n_refs++;
2519     }
2520   return src;
2521 }
2522
2523 void
2524 lex_source_unref (struct lex_source *src)
2525 {
2526   if (!src)
2527     return;
2528
2529   assert (src->n_refs > 0);
2530   if (--src->n_refs > 0)
2531     return;
2532
2533   char *file_name = src->reader->file_name;
2534   char *encoding = src->reader->encoding;
2535   if (src->reader->class->destroy != NULL)
2536     src->reader->class->destroy (src->reader);
2537   free (file_name);
2538   free (encoding);
2539   free (src->buffer);
2540   free (src->lines);
2541   lex_stage_uninit (&src->pp);
2542   lex_stage_uninit (&src->merge);
2543   lex_source_clear_parse (src);
2544   free (src->parse);
2545   free (src);
2546 }
2547 \f
/* A lex_reader that obtains its input from a u8_istream, as created by
   lex_reader_for_file(). */
struct lex_file_reader
  {
    struct lex_reader reader;     /* Base reader; see lex_file_reader_cast(). */
    struct u8_istream *istream;   /* Stream that lex_file_read() reads from. */
  };

/* Forward declaration; the initializer follows the functions it refers to. */
static struct lex_reader_class lex_file_reader_class;
2555
2556 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2557    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2558    ENCODING, which should take one of the forms accepted by
2559    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2560    mode of the new reader, respectively.
2561
2562    Returns a null pointer if FILE_NAME cannot be opened. */
2563 struct lex_reader *
2564 lex_reader_for_file (const char *file_name, const char *encoding,
2565                      enum segmenter_mode syntax,
2566                      enum lex_error_mode error)
2567 {
2568   struct lex_file_reader *r;
2569   struct u8_istream *istream;
2570
2571   istream = (!strcmp(file_name, "-")
2572              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2573              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2574   if (istream == NULL)
2575     {
2576       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2577       return NULL;
2578     }
2579
2580   r = xmalloc (sizeof *r);
2581   lex_reader_init (&r->reader, &lex_file_reader_class);
2582   r->reader.syntax = syntax;
2583   r->reader.error = error;
2584   r->reader.file_name = xstrdup (file_name);
2585   r->reader.encoding = xstrdup_if_nonnull (encoding);
2586   r->reader.line_number = 1;
2587   r->istream = istream;
2588
2589   return &r->reader;
2590 }
2591
2592 static struct lex_file_reader *
2593 lex_file_reader_cast (struct lex_reader *r)
2594 {
2595   return UP_CAST (r, struct lex_file_reader, reader);
2596 }
2597
2598 static size_t
2599 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2600                enum prompt_style prompt_style UNUSED)
2601 {
2602   struct lex_file_reader *r = lex_file_reader_cast (r_);
2603   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2604   if (n_read < 0)
2605     {
2606       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2607       return 0;
2608     }
2609   return n_read;
2610 }
2611
2612 static void
2613 lex_file_close (struct lex_reader *r_)
2614 {
2615   struct lex_file_reader *r = lex_file_reader_cast (r_);
2616
2617   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2618     {
2619       if (u8_istream_close (r->istream) != 0)
2620         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2621     }
2622   else
2623     u8_istream_free (r->istream);
2624
2625   free (r);
2626 }
2627
2628 static struct lex_reader_class lex_file_reader_class =
2629   {
2630     lex_file_read,
2631     lex_file_close
2632   };
2633 \f
/* A lex_reader that obtains its input from an in-memory substring, as created
   by lex_reader_for_substring_nocopy(). */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Base reader; see lex_string_reader_cast(). */
    struct substring s;         /* Text to read; freed by lex_string_close(). */
    size_t offset;              /* Number of bytes of 's' already read. */
  };

/* Forward declaration; the initializer follows the functions it refers to. */
static struct lex_reader_class lex_string_reader_class;
2642
2643 /* Creates and returns a new lex_reader for the contents of S, which must be
2644    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2645    with ss_dealloc() when it is closed. */
2646 struct lex_reader *
2647 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2648 {
2649   struct lex_string_reader *r;
2650
2651   r = xmalloc (sizeof *r);
2652   lex_reader_init (&r->reader, &lex_string_reader_class);
2653   r->reader.syntax = SEG_MODE_AUTO;
2654   r->reader.encoding = xstrdup_if_nonnull (encoding);
2655   r->s = s;
2656   r->offset = 0;
2657
2658   return &r->reader;
2659 }
2660
2661 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2662    which must be encoded in ENCODING.  The caller retains ownership of S. */
2663 struct lex_reader *
2664 lex_reader_for_string (const char *s, const char *encoding)
2665 {
2666   return lex_reader_for_substring_nocopy (ss_clone (ss_cstr (s)), encoding);
2667 }
2668
/* Formats FORMAT, a printf()-like format string, with the variadic arguments
   that follow ENCODING, and creates and returns a new lex_reader for the
   formatted result.  The formatted string is handed over to the new reader
   rather than copied. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2683
2684 static struct lex_string_reader *
2685 lex_string_reader_cast (struct lex_reader *r)
2686 {
2687   return UP_CAST (r, struct lex_string_reader, reader);
2688 }
2689
2690 static size_t
2691 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2692                  enum prompt_style prompt_style UNUSED)
2693 {
2694   struct lex_string_reader *r = lex_string_reader_cast (r_);
2695   size_t chunk;
2696
2697   chunk = MIN (n, r->s.length - r->offset);
2698   memcpy (buf, r->s.string + r->offset, chunk);
2699   r->offset += chunk;
2700
2701   return chunk;
2702 }
2703
2704 static void
2705 lex_string_close (struct lex_reader *r_)
2706 {
2707   struct lex_string_reader *r = lex_string_reader_cast (r_);
2708
2709   ss_dealloc (&r->s);
2710   free (r);
2711 }
2712
2713 static struct lex_reader_class lex_string_reader_class =
2714   {
2715     lex_string_read,
2716     lex_string_close
2717   };
2718 \f
2719 struct substring
2720 lex_source_get_line (const struct lex_source *src, int line)
2721 {
2722   if (line < 1 || line > src->n_lines)
2723     return ss_empty ();
2724
2725   size_t ofs = src->lines[line - 1];
2726   size_t end;
2727   if (line < src->n_lines)
2728     end = src->lines[line];
2729   else
2730     {
2731       const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2732       end = newline ? newline - src->buffer : src->length;
2733     }
2734   return ss_buffer (&src->buffer[ofs], end - ofs);
2735 }