pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call. */
  70     size_t token_pos;           /* Offset into src->buffer of token start. */
  71     size_t token_len;           /* Length of source for token in bytes. */
  72
  73     /* For a token obtained through macro expansion, this is just this token.
  74
  75        For a token obtained through the lexer in an ordinary way, these are
  76        nulls and zeros. */
  77     char *macro_rep;        /* The whole macro expansion. */
  78     size_t ofs;             /* Offset of this token in macro_rep. */
  79     size_t len;             /* Length of this token in macro_rep. */
  80     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  81   };
  82
  83 static struct msg_point lex_token_start_point (const struct lex_source *,
  84                                                const struct lex_token *);
  85 static struct msg_point lex_token_end_point (const struct lex_source *,
  86                                              const struct lex_token *);
  87
  88 static bool lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s,
  89                                  size_t *n_matchedp);
  90
  91 /* Source offset of the last byte in TOKEN. */
  92 static size_t
  93 lex_token_end (const struct lex_token *token)
  94 {
  95   return token->token_pos + MAX (token->token_len, 1) - 1;
  96 }
  97
  98 static void
  99 lex_token_destroy (struct lex_token *t)
 100 {
 101   token_uninit (&t->token);
 102   if (t->ref_cnt)
 103     {
 104       assert (*t->ref_cnt > 0);
 105       if (!--*t->ref_cnt)
 106         {
 107           free (t->macro_rep);
 108           free (t->ref_cnt);
 109         }
 110     }
 111   free (t);
 112 }
 113 \f
 114 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 115    lex_source. */
 116 struct lex_stage
 117   {
 118     struct deque deque;
 119     struct lex_token **tokens;
 120   };
 121
 122 static void lex_stage_clear (struct lex_stage *);
 123 static void lex_stage_uninit (struct lex_stage *);
 124
 125 static size_t lex_stage_count (const struct lex_stage *);
 126 static bool lex_stage_is_empty (const struct lex_stage *);
 127
 128 static struct lex_token *lex_stage_first (struct lex_stage *);
 129 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 130
 131 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 132 static void lex_stage_pop_first (struct lex_stage *);
 133
 134 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 135                              size_t n);
 136
 137 /* Deletes all the tokens from STAGE. */
 138 static void
 139 lex_stage_clear (struct lex_stage *stage)
 140 {
 141   while (!deque_is_empty (&stage->deque))
 142     lex_stage_pop_first (stage);
 143 }
 144
 145 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 146 static void
 147 lex_stage_uninit (struct lex_stage *stage)
 148 {
 149   lex_stage_clear (stage);
 150   free (stage->tokens);
 151 }
 152
 153 /* Returns true if STAGE contains no tokens, otherwise false. */
 154 static bool
 155 lex_stage_is_empty (const struct lex_stage *stage)
 156 {
 157   return deque_is_empty (&stage->deque);
 158 }
 159
 160 /* Returns the number of tokens in STAGE. */
 161 static size_t
 162 lex_stage_count (const struct lex_stage *stage)
 163 {
 164   return deque_count (&stage->deque);
 165 }
 166
 167 /* Returns the first token in STAGE, which must be nonempty.
 168    The first token is the one accessed with the least lookahead. */
 169 static struct lex_token *
 170 lex_stage_first (struct lex_stage *stage)
 171 {
 172   return lex_stage_nth (stage, 0);
 173 }
 174
 175 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 176    lookahead) is 0, the second token is 1, and so on.  There must be at least
 177    INDEX + 1 tokens in STAGE. */
 178 static struct lex_token *
 179 lex_stage_nth (struct lex_stage *stage, size_t index)
 180 {
 181   return stage->tokens[deque_back (&stage->deque, index)];
 182 }
 183
 184 /* Adds TOKEN so that it becomes the last token in STAGE. */
 185 static void
 186 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 187 {
 188   if (deque_is_full (&stage->deque))
 189     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 190                                   sizeof *stage->tokens);
 191   stage->tokens[deque_push_front (&stage->deque)] = token;
 192 }
 193
 194 /* Removes and returns the first token from STAGE. */
 195 static struct lex_token *
 196 lex_stage_take_first (struct lex_stage *stage)
 197 {
 198   return stage->tokens[deque_pop_back (&stage->deque)];
 199 }
 200
 201 /* Removes the first token from STAGE and uninitializes it. */
 202 static void
 203 lex_stage_pop_first (struct lex_stage *stage)
 204 {
 205   lex_token_destroy (lex_stage_take_first (stage));
 206 }
 207
 208 /* Removes the first N tokens from SRC, appending them to DST as the last
 209    tokens. */
 210 static void
 211 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 212 {
 213   for (size_t i = 0; i < n; i++)
 214     lex_stage_push_last (dst, lex_stage_take_first (src));
 215 }
 216
 217 /* A source of tokens, corresponding to a syntax file.
 218
 219    This is conceptually a lex_reader wrapped with everything needed to convert
 220    its UTF-8 bytes into tokens. */
 221 struct lex_source
 222   {
 223     struct ll ll;               /* In lexer's list of sources. */
 224
 225     /* Reference count:
 226
 227        - One for struct lexer.
 228
 229        - One for each struct msg_location that references this source. */
 230     size_t n_refs;
 231
 232     struct lex_reader *reader;
 233     struct lexer *lexer;
 234     struct segmenter segmenter;
 235     bool eof;                   /* True if T_STOP was read from 'reader'. */
 236
 237     /* Buffer of UTF-8 bytes. */
 238     char *buffer;               /* Source file contents. */
 239     size_t length;              /* Number of bytes filled. */
 240     size_t allocated;           /* Number of bytes allocated. */
 241
 242     /* Offsets into 'buffer'. */
 243     size_t journal_pos;         /* First byte not yet output to journal. */
 244     size_t seg_pos;             /* First byte not yet scanned as token. */
 245
 246     /* Offset into 'buffer' of starts of lines. */
 247     size_t *lines;
 248     size_t n_lines, allocated_lines;
 249
 250     bool suppress_next_newline;
 251
 252     /* Tokens.
 253
 254        This is a pipeline with the following stages.  Each token eventually
 255        made available to the parser passes through of these stages.  The stages
 256        are named after the processing that happens in each one.
 257
 258        Initially, tokens come from the segmenter and scanner to 'pp':
 259
 260        - pp: Tokens that need to pass through the macro preprocessor to end up
 261          in 'merge'.
 262
 263        - merge: Tokens that need to pass through scan_merge() to end up in
 264          'parse'.
 265
 266        - parse: Tokens available to the client for parsing.
 267
 268       'pp' and 'merge' store tokens only temporarily until they pass into
 269       'parse'.  Tokens then live in 'parse' until the command is fully
 270       consumed, at which time they are freed together. */
 271     struct lex_stage pp;
 272     struct lex_stage merge;
 273     struct lex_token **parse;
 274     size_t n_parse, allocated_parse, parse_ofs;
 275   };
 276
 277 static struct lex_source *lex_source_create (struct lexer *,
 278                                              struct lex_reader *);
 279
 280 /* Lexer. */
 281 struct lexer
 282   {
 283     struct ll_list sources;     /* Contains "struct lex_source"s. */
 284     struct macro_set *macros;
 285   };
 286
 287 static struct lex_source *lex_source__ (const struct lexer *);
 288 static char *lex_source_syntax__ (const struct lex_source *,
 289                                   int ofs0, int ofs1);
 290 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 291 static void lex_source_push_endcmd__ (struct lex_source *);
 292 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 293 static void lex_source_clear_parse (struct lex_source *);
 294
 295 static bool lex_source_get_parse (struct lex_source *);
 296 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
 297                                    int ofs0, int ofs1,
 298                                    const char *format, va_list)
 299    PRINTF_FORMAT (5, 0);
 300 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 301                                                   int n);
 302 \f
 303 /* Initializes READER with the specified CLASS and otherwise some reasonable
 304    defaults.  The caller should fill in the others members as desired. */
 305 void
 306 lex_reader_init (struct lex_reader *reader,
 307                  const struct lex_reader_class *class)
 308 {
 309   reader->class = class;
 310   reader->syntax = SEG_MODE_AUTO;
 311   reader->error = LEX_ERROR_CONTINUE;
 312   reader->file_name = NULL;
 313   reader->encoding = NULL;
 314   reader->line_number = 0;
 315   reader->eof = false;
 316 }
 317
 318 /* Frees any file name already in READER and replaces it by a copy of
 319    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 320 void
 321 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 322 {
 323   free (reader->file_name);
 324   reader->file_name = xstrdup_if_nonnull (file_name);
 325 }
 326 \f
 327 /* Creates and returns a new lexer. */
 328 struct lexer *
 329 lex_create (void)
 330 {
 331   struct lexer *lexer = xmalloc (sizeof *lexer);
 332   *lexer = (struct lexer) {
 333     .sources = LL_INITIALIZER (lexer->sources),
 334     .macros = macro_set_create (),
 335   };
 336   return lexer;
 337 }
 338
 339 /* Destroys LEXER. */
 340 void
 341 lex_destroy (struct lexer *lexer)
 342 {
 343   if (lexer != NULL)
 344     {
 345       struct lex_source *source, *next;
 346
 347       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 348         {
 349           ll_remove (&source->ll);
 350           lex_source_unref (source);
 351         }
 352       macro_set_destroy (lexer->macros);
 353       free (lexer);
 354     }
 355 }
 356
 357 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 358    same name.  Takes ownership of M. */
 359 void
 360 lex_define_macro (struct lexer *lexer, struct macro *m)
 361 {
 362   macro_set_add (lexer->macros, m);
 363 }
 364
 365 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 366    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 367    token. */
 368 void
 369 lex_include (struct lexer *lexer, struct lex_reader *reader)
 370 {
 371   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 372   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 373 }
 374
 375 /* Appends READER to LEXER, so that it will be read after all other current
 376    readers have already been read. */
 377 void
 378 lex_append (struct lexer *lexer, struct lex_reader *reader)
 379 {
 380   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 381 }
 382 \f
 383 /* Advancing. */
 384
 385 /* Advances LEXER to the next token, consuming the current token. */
 386 void
 387 lex_get (struct lexer *lexer)
 388 {
 389   struct lex_source *src;
 390
 391   src = lex_source__ (lexer);
 392   if (src == NULL)
 393     return;
 394
 395   if (src->parse_ofs < src->n_parse)
 396     {
 397       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 398         lex_source_clear_parse (src);
 399       else
 400         src->parse_ofs++;
 401     }
 402
 403   while (src->parse_ofs == src->n_parse)
 404     if (!lex_source_get_parse (src))
 405       {
 406         ll_remove (&src->ll);
 407         lex_source_unref (src);
 408         src = lex_source__ (lexer);
 409         if (src == NULL)
 410           return;
 411       }
 412 }
 413
 414 /* Advances LEXER by N tokens. */
 415 void
 416 lex_get_n (struct lexer *lexer, size_t n)
 417 {
 418   while (n-- > 0)
 419     lex_get (lexer);
 420 }
 421 \f
 422 /* Issuing errors. */
 423
 424 /* Prints a syntax error message containing the current token and
 425    given message MESSAGE (if non-null). */
 426 void
 427 lex_error (struct lexer *lexer, const char *format, ...)
 428 {
 429   va_list args;
 430
 431   va_start (args, format);
 432   lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
 433                       format, args);
 434   va_end (args);
 435 }
 436
 437 /* Prints a syntax error message for the span of tokens N0 through N1,
 438    inclusive, from the current token in LEXER, adding message MESSAGE (if
 439    non-null). */
 440 void
 441 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 442 {
 443   va_list args;
 444
 445   va_start (args, format);
 446   int ofs = lex_ofs (lexer);
 447   lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
 448   va_end (args);
 449 }
 450
 451 /* Prints a syntax error message for the span of tokens with offsets OFS0
 452    through OFS1, inclusive, within the current command in LEXER, adding message
 453    MESSAGE (if non-null). */
 454 void
 455 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
 456 {
 457   va_list args;
 458
 459   va_start (args, format);
 460   lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
 461   va_end (args);
 462 }
 463
 464 /* Prints a message of the given CLASS containing the current token and given
 465    message MESSAGE (if non-null). */
 466 void
 467 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
 468 {
 469   va_list args;
 470
 471   va_start (args, format);
 472   lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
 473                       format, args);
 474   va_end (args);
 475 }
 476
 477 /* Prints a syntax error message for the span of tokens N0 through N1,
 478    inclusive, from the current token in LEXER, adding message MESSAGE (if
 479    non-null). */
 480 void
 481 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
 482               const char *format, ...)
 483 {
 484   va_list args;
 485
 486   va_start (args, format);
 487   int ofs = lex_ofs (lexer);
 488   lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
 489   va_end (args);
 490 }
 491
 492 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
 493    through OFS1, inclusive, within the current command in LEXER, adding message
 494    MESSAGE (if non-null). */
 495 void
 496 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
 497              const char *format, ...)
 498 {
 499   va_list args;
 500
 501   va_start (args, format);
 502   lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
 503   va_end (args);
 504 }
 505
 506 /* Prints a syntax error message saying that one of the strings provided as
 507    varargs, up to the first NULL, is expected. */
 508 void
 509 (lex_error_expecting) (struct lexer *lexer, ...)
 510 {
 511   va_list args;
 512
 513   va_start (args, lexer);
 514   lex_error_expecting_valist (lexer, args);
 515   va_end (args);
 516 }
 517
 518 /* Prints a syntax error message saying that one of the options provided in
 519    ARGS, up to the first NULL, is expected. */
 520 void
 521 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 522 {
 523   const char **options = NULL;
 524   size_t allocated = 0;
 525   size_t n = 0;
 526
 527   for (;;)
 528     {
 529       const char *option = va_arg (args, const char *);
 530       if (!option)
 531         break;
 532
 533       if (n >= allocated)
 534         options = x2nrealloc (options, &allocated, sizeof *options);
 535       options[n++] = option;
 536     }
 537   lex_error_expecting_array (lexer, options, n);
 538   free (options);
 539 }
 540
 541 void
 542 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 543 {
 544   switch (n)
 545     {
 546     case 0:
 547       lex_error (lexer, NULL);
 548       break;
 549
 550     case 1:
 551       lex_error (lexer, _("Syntax error expecting %s."), options[0]);
 552       break;
 553
 554     case 2:
 555       lex_error (lexer, _("Syntax error expecting %s or %s."),
 556                  options[0], options[1]);
 557       break;
 558
 559     case 3:
 560       lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
 561                  options[0], options[1], options[2]);
 562       break;
 563
 564     case 4:
 565       lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
 566                  options[0], options[1], options[2], options[3]);
 567       break;
 568
 569     case 5:
 570       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
 571                  options[0], options[1], options[2], options[3], options[4]);
 572       break;
 573
 574     case 6:
 575       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
 576                  options[0], options[1], options[2], options[3], options[4],
 577                  options[5]);
 578       break;
 579
 580     case 7:
 581       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
 582                           "or %s."),
 583                  options[0], options[1], options[2], options[3], options[4],
 584                  options[5], options[6]);
 585       break;
 586
 587     case 8:
 588       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
 589                           "or %s."),
 590                  options[0], options[1], options[2], options[3], options[4],
 591                  options[5], options[6], options[7]);
 592       break;
 593
 594     default:
 595       {
 596         struct string s = DS_EMPTY_INITIALIZER;
 597         for (size_t i = 0; i < n; i++)
 598           {
 599             if (i > 0)
 600               ds_put_cstr (&s, ", ");
 601             ds_put_cstr (&s, options[i]);
 602           }
 603         lex_error (lexer, _("Syntax error expecting one of the following: %s."),
 604                    ds_cstr (&s));
 605         ds_destroy (&s);
 606       }
 607       break;
 608     }
 609 }
 610
 611 /* Reports an error to the effect that subcommand SBC may only be specified
 612    once. */
 613 void
 614 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
 615 {
 616   int ofs = lex_ofs (lexer) - 1;
 617   if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
 618     ofs--;
 619
 620   /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
 621      BY. */
 622   if (lex_ofs_at_phrase__ (lexer, ofs, sbc, NULL))
 623     lex_ofs_error (lexer, ofs, ofs,
 624                    _("Subcommand %s may only be specified once."), sbc);
 625   else
 626     msg (SE, _("Subcommand %s may only be specified once."), sbc);
 627 }
 628
 629 /* Reports an error to the effect that subcommand SBC is missing.
 630
 631    This function does not take a lexer as an argument or use lex_error(),
 632    because a missing subcommand can normally be detected only after the whole
 633    command has been parsed, and so lex_error() would always report "Syntax
 634    error at end of command", which does not help the user find the error. */
 635 void
 636 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 637 {
 638   lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
 639                  _("Required subcommand %s was not specified."), sbc);
 640 }
 641
 642 /* Reports an error to the effect that specification SPEC may only be specified
 643    once within subcommand SBC. */
 644 void
 645 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 646 {
 647   lex_error (lexer, _("%s may only be specified once within subcommand %s."),
 648              spec, sbc);
 649 }
 650
 651 /* Reports an error to the effect that specification SPEC is missing within
 652    subcommand SBC. */
 653 void
 654 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 655 {
 656   lex_error (lexer, _("Required %s specification missing from %s subcommand."),
 657              spec, sbc);
 658 }
 659
 660 /* Prints a syntax error message for the span of tokens with offsets OFS0
 661    through OFS1, inclusive, within the current command in LEXER, adding message
 662    MESSAGE (if non-null) with the given ARGS. */
 663 void
 664 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
 665                     int ofs0, int ofs1, const char *format, va_list args)
 666 {
 667   lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
 668 }
 669
 670 /* Checks that we're at end of command.
 671    If so, returns a successful command completion code.
 672    If not, flags a syntax error and returns an error command
 673    completion code. */
 674 int
 675 lex_end_of_command (struct lexer *lexer)
 676 {
 677   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 678     {
 679       lex_error (lexer, _("Syntax error expecting end of command."));
 680       return CMD_FAILURE;
 681     }
 682   else
 683     return CMD_SUCCESS;
 684 }
 685 \f
 686 /* Token testing functions. */
 687
 688 /* Returns true if the current token is a number. */
 689 bool
 690 lex_is_number (const struct lexer *lexer)
 691 {
 692   return lex_next_is_number (lexer, 0);
 693 }
 694
 695 /* Returns true if the current token is a string. */
 696 bool
 697 lex_is_string (const struct lexer *lexer)
 698 {
 699   return lex_next_is_string (lexer, 0);
 700 }
 701
 702 /* Returns the value of the current token, which must be a
 703    floating point number. */
 704 double
 705 lex_number (const struct lexer *lexer)
 706 {
 707   return lex_next_number (lexer, 0);
 708 }
 709
 710 /* Returns true iff the current token is an integer. */
 711 bool
 712 lex_is_integer (const struct lexer *lexer)
 713 {
 714   return lex_next_is_integer (lexer, 0);
 715 }
 716
 717 /* Returns the value of the current token, which must be an
 718    integer. */
 719 long
 720 lex_integer (const struct lexer *lexer)
 721 {
 722   return lex_next_integer (lexer, 0);
 723 }
 724 \f
 725 /* Token testing functions with lookahead.
 726
 727    A value of 0 for N as an argument to any of these functions refers to the
 728    current token.  Lookahead is limited to the current command.  Any N greater
 729    than the number of tokens remaining in the current command will be treated
 730    as referring to a T_ENDCMD token. */
 731
 732 /* Returns true if the token N ahead of the current token is a number. */
 733 bool
 734 lex_next_is_number (const struct lexer *lexer, int n)
 735 {
 736   return token_is_number (lex_next (lexer, n));
 737 }
 738
 739 /* Returns true if the token N ahead of the current token is a string. */
 740 bool
 741 lex_next_is_string (const struct lexer *lexer, int n)
 742 {
 743   return token_is_string (lex_next (lexer, n));
 744 }
 745
 746 /* Returns the value of the token N ahead of the current token, which must be a
 747    floating point number. */
 748 double
 749 lex_next_number (const struct lexer *lexer, int n)
 750 {
 751   return token_number (lex_next (lexer, n));
 752 }
 753
 754 /* Returns true if the token N ahead of the current token is an integer. */
 755 bool
 756 lex_next_is_integer (const struct lexer *lexer, int n)
 757 {
 758   return token_is_integer (lex_next (lexer, n));
 759 }
 760
 761 /* Returns the value of the token N ahead of the current token, which must be
 762    an integer. */
 763 long
 764 lex_next_integer (const struct lexer *lexer, int n)
 765 {
 766   return token_integer (lex_next (lexer, n));
 767 }
 768 \f
 769 /* Token matching functions. */
 770
 771 /* If the current token has the specified TYPE, skips it and returns true.
 772    Otherwise, returns false. */
 773 bool
 774 lex_match (struct lexer *lexer, enum token_type type)
 775 {
 776   if (lex_token (lexer) == type)
 777     {
 778       lex_get (lexer);
 779       return true;
 780     }
 781   else
 782     return false;
 783 }
 784
 785 /* If the current token matches IDENTIFIER, skips it and returns true.
 786    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 787    returns false.
 788
 789    IDENTIFIER must be an ASCII string. */
 790 bool
 791 lex_match_id (struct lexer *lexer, const char *identifier)
 792 {
 793   return lex_match_id_n (lexer, identifier, 3);
 794 }
 795
 796 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 797    may be abbreviated to its first N letters.  Otherwise, returns false.
 798
 799    IDENTIFIER must be an ASCII string. */
 800 bool
 801 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 802 {
 803   if (lex_token (lexer) == T_ID
 804       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 805     {
 806       lex_get (lexer);
 807       return true;
 808     }
 809   else
 810     return false;
 811 }
 812
 813 /* If the current token is integer X, skips it and returns true.  Otherwise,
 814    returns false. */
 815 bool
 816 lex_match_int (struct lexer *lexer, int x)
 817 {
 818   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 819     {
 820       lex_get (lexer);
 821       return true;
 822     }
 823   else
 824     return false;
 825 }
 826 \f
 827 /* Forced matches. */
 828
 829 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 830    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 831    false.
 832
 833    IDENTIFIER must be an ASCII string. */
 834 bool
 835 lex_force_match_id (struct lexer *lexer, const char *identifier)
 836 {
 837   if (lex_match_id (lexer, identifier))
 838     return true;
 839   else
 840     {
 841       lex_error_expecting (lexer, identifier);
 842       return false;
 843     }
 844 }
 845
 846 /* If the current token has the specified TYPE, skips it and returns true.
 847    Otherwise, reports an error and returns false. */
 848 bool
 849 lex_force_match (struct lexer *lexer, enum token_type type)
 850 {
 851   if (lex_token (lexer) == type)
 852     {
 853       lex_get (lexer);
 854       return true;
 855     }
 856   else
 857     {
 858       const char *type_string = token_type_to_string (type);
 859       if (type_string)
 860         {
 861           char *s = xasprintf ("`%s'", type_string);
 862           lex_error_expecting (lexer, s);
 863           free (s);
 864         }
 865       else
 866         lex_error_expecting (lexer, token_type_to_name (type));
 867
 868       return false;
 869     }
 870 }
 871
 872 /* If the current token is a string, does nothing and returns true.
 873    Otherwise, reports an error and returns false. */
 874 bool
 875 lex_force_string (struct lexer *lexer)
 876 {
 877   if (lex_is_string (lexer))
 878     return true;
 879   else
 880     {
 881       lex_error (lexer, _("Syntax error expecting string."));
 882       return false;
 883     }
 884 }
 885
 886 /* If the current token is a string or an identifier, does nothing and returns
 887    true.  Otherwise, reports an error and returns false.
 888
 889    This is meant for use in syntactic situations where we want to encourage the
 890    user to supply a quoted string, but for compatibility we also accept
 891    identifiers.  (One example of such a situation is file names.)  Therefore,
 892    the error message issued when the current token is wrong only says that a
 893    string is expected and doesn't mention that an identifier would also be
 894    accepted. */
 895 bool
 896 lex_force_string_or_id (struct lexer *lexer)
 897 {
 898   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 899 }
 900
 901 /* If the current token is an integer, does nothing and returns true.
 902    Otherwise, reports an error and returns false. */
 903 bool
 904 lex_force_int (struct lexer *lexer)
 905 {
 906   if (lex_is_integer (lexer))
 907     return true;
 908   else
 909     {
 910       lex_error (lexer, _("Syntax error expecting integer."));
 911       return false;
 912     }
 913 }
 914
 915 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 916    nothing and returns true.  Otherwise, reports an error and returns false.
 917    If NAME is nonnull, then it is used in the error message. */
 918 bool
 919 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 920 {
 921   bool is_number = lex_is_number (lexer);
 922   bool is_integer = lex_is_integer (lexer);
 923   bool too_small = (is_integer ? lex_integer (lexer) < min
 924                     : is_number ? lex_number (lexer) < min
 925                     : false);
 926   bool too_big = (is_integer ? lex_integer (lexer) > max
 927                   : is_number ? lex_number (lexer) > max
 928                   : false);
 929   if (is_integer && !too_small && !too_big)
 930     return true;
 931
 932   if (min > max)
 933     {
 934       /* Weird, maybe a bug in the caller.  Just report that we needed an
 935          integer. */
 936       if (name)
 937         lex_error (lexer, _("Syntax error expecting integer for %s."), name);
 938       else
 939         lex_error (lexer, _("Syntax error expecting integer."));
 940     }
 941   else if (min == max)
 942     {
 943       if (name)
 944         lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
 945       else
 946         lex_error (lexer, _("Syntax error expecting %ld."), min);
 947     }
 948   else if (min + 1 == max)
 949     {
 950       if (name)
 951         lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
 952                    min, min + 1, name);
 953       else
 954         lex_error (lexer, _("Syntax error expecting %ld or %ld."),
 955                    min, min + 1);
 956     }
 957   else
 958     {
 959       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 960       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 961
 962       if (report_lower_bound && report_upper_bound)
 963         {
 964           if (name)
 965             lex_error (lexer,
 966                        _("Syntax error expecting integer "
 967                          "between %ld and %ld for %s."),
 968                        min, max, name);
 969           else
 970             lex_error (lexer, _("Syntax error expecting integer "
 971                                 "between %ld and %ld."),
 972                        min, max);
 973         }
 974       else if (report_lower_bound)
 975         {
 976           if (min == 0)
 977             {
 978               if (name)
 979                 lex_error (lexer, _("Syntax error expecting "
 980                                     "non-negative integer for %s."),
 981                            name);
 982               else
 983                 lex_error (lexer, _("Syntax error expecting "
 984                                     "non-negative integer."));
 985             }
 986           else if (min == 1)
 987             {
 988               if (name)
 989                 lex_error (lexer, _("Syntax error expecting "
 990                                     "positive integer for %s."),
 991                            name);
 992               else
 993                 lex_error (lexer, _("Syntax error expecting "
 994                                     "positive integer."));
 995             }
 996           else
 997             {
 998               if (name)
 999                 lex_error (lexer, _("Syntax error expecting "
1000                                     "integer %ld or greater for %s."),
1001                            min, name);
1002               else
1003                 lex_error (lexer, _("Syntax error expecting "
1004                                     "integer %ld or greater."), min);
1005             }
1006         }
1007       else if (report_upper_bound)
1008         {
1009           if (name)
1010             lex_error (lexer,
1011                        _("Syntax error expecting integer less than or equal "
1012                          "to %ld for %s."),
1013                        max, name);
1014           else
1015             lex_error (lexer, _("Syntax error expecting integer less than or "
1016                                 "equal to %ld."),
1017                        max);
1018         }
1019       else
1020         {
1021           if (name)
1022             lex_error (lexer, _("Syntax error expecting integer for %s."),
1023                        name);
1024           else
1025             lex_error (lexer, _("Syntax error expecting integer."));
1026         }
1027     }
1028   return false;
1029 }
1030
/* Requires the current token to be a number.  Returns true, doing nothing
   else, if it is.  Otherwise reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);
  if (!ok)
    lex_error (lexer, _("Syntax error expecting number."));
  return ok;
}
1042
/* Requires the current token to be a number in the closed range [MIN,MAX].
   Returns true, doing nothing else, if it is.  Otherwise reports an error and
   returns false.  If NAME is nonnull, the error message mentions it.

   A bound is included in the error message if it lies strictly inside
   (-DBL_MAX, DBL_MAX) or if the token is a number that violates it. */
bool
lex_force_num_range_closed (struct lexer *lexer, const char *name,
                            double min, double max)
{
  bool is_number = lex_is_number (lexer);

  /* A non-numeric token is neither too small nor too big. */
  bool too_small = false;
  bool too_big = false;
  if (is_number)
    {
      too_small = lex_number (lexer) < min;
      too_big = lex_number (lexer) > max;
    }

  if (is_number && !(too_small || too_big))
    return true;

  /* MIN > MAX is weird, maybe a bug in the caller.  In that case skip the
     range-specific wording and just report that we needed a number, using
     the generic message at the bottom. */
  if (!(min > max))
    {
      if (min == max)
        {
          if (name)
            lex_error (lexer, _("Syntax error expecting number %g for %s."),
                       min, name);
          else
            lex_error (lexer, _("Syntax error expecting number %g."), min);
          return false;
        }

      bool report_lower_bound = min > -DBL_MAX || too_small;
      bool report_upper_bound = max < DBL_MAX || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number between "
                         "%g and %g for %s."),
                       min, max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number between "
                         "%g and %g."),
                       min, max);
          return false;
        }

      if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer,
                           _("Syntax error expecting non-negative number "
                             "for %s."),
                           name);
              else
                lex_error (lexer,
                           _("Syntax error expecting non-negative number."));
            }
          else
            {
              if (name)
                lex_error (lexer,
                           _("Syntax error expecting number %g or greater "
                             "for %s."),
                           min, name);
              else
                lex_error (lexer,
                           _("Syntax error expecting number %g or greater."),
                           min);
            }
          return false;
        }

      if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number less than or "
                         "equal to %g for %s."),
                       max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number less than or "
                         "equal to %g."),
                       max);
          return false;
        }
    }

  /* Generic message: either the range was inverted or neither bound was
     worth reporting. */
  if (name)
    lex_error (lexer, _("Syntax error expecting number for %s."), name);
  else
    lex_error (lexer, _("Syntax error expecting number."));
  return false;
}
1135
/* Requires the current token to be a number in the half-open range
   [MIN,MAX).  Returns true, doing nothing else, if it is.  Otherwise reports
   an error and returns false.  If NAME is nonnull, the error message mentions
   it.

   A bound is included in the error message if it lies strictly inside
   (-DBL_MAX, DBL_MAX) or if the token is a number that violates it. */
bool
lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
                              double min, double max)
{
  bool is_number = lex_is_number (lexer);

  /* A non-numeric token is neither too small nor too big. */
  bool too_small = false;
  bool too_big = false;
  if (is_number)
    {
      too_small = lex_number (lexer) < min;
      too_big = lex_number (lexer) >= max;
    }

  if (is_number && !(too_small || too_big))
    return true;

  /* MIN >= MAX is weird, maybe a bug in the caller.  In that case skip the
     range-specific wording and just report that we needed a number, using
     the generic message at the bottom. */
  if (!(min >= max))
    {
      bool report_lower_bound = min > -DBL_MAX || too_small;
      bool report_upper_bound = max < DBL_MAX || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number in [%g,%g) for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
                       min, max);
          return false;
        }

      if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer,
                           _("Syntax error expecting non-negative number "
                             "for %s."),
                           name);
              else
                lex_error (lexer,
                           _("Syntax error expecting non-negative number."));
            }
          else
            {
              if (name)
                lex_error (lexer,
                           _("Syntax error expecting number %g or greater "
                             "for %s."),
                           min, name);
              else
                lex_error (lexer,
                           _("Syntax error expecting number %g or greater."),
                           min);
            }
          return false;
        }

      if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number less than %g for %s."),
                       max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number less than %g."), max);
          return false;
        }
    }

  /* Generic message: either the range was empty or inverted, or neither
     bound was worth reporting. */
  if (name)
    lex_error (lexer, _("Syntax error expecting number for %s."), name);
  else
    lex_error (lexer, _("Syntax error expecting number."));
  return false;
}
1216
/* Requires the current token to be a number in the open range (MIN,MAX).
   Returns true, doing nothing else, if it is.  Otherwise reports an error and
   returns false.  If NAME is nonnull, the error message mentions it.

   A bound is included in the error message if it lies strictly inside
   (-DBL_MAX, DBL_MAX) or if the token is a number that violates it. */
bool
lex_force_num_range_open (struct lexer *lexer, const char *name,
                          double min, double max)
{
  bool is_number = lex_is_number (lexer);

  /* A non-numeric token is neither too small nor too big. */
  bool too_small = false;
  bool too_big = false;
  if (is_number)
    {
      too_small = lex_number (lexer) <= min;
      too_big = lex_number (lexer) >= max;
    }

  if (is_number && !(too_small || too_big))
    return true;

  /* MIN >= MAX is weird, maybe a bug in the caller.  In that case skip the
     range-specific wording and just report that we needed a number, using
     the generic message at the bottom. */
  if (!(min >= max))
    {
      bool report_lower_bound = min > -DBL_MAX || too_small;
      bool report_upper_bound = max < DBL_MAX || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number in (%g,%g) for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Syntax error expecting number in (%g,%g)."),
                       min, max);
          return false;
        }

      if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer,
                           _("Syntax error expecting positive number for %s."),
                           name);
              else
                lex_error (lexer,
                           _("Syntax error expecting positive number."));
            }
          else
            {
              if (name)
                lex_error (lexer,
                           _("Syntax error expecting number greater than %g "
                             "for %s."),
                           min, name);
              else
                lex_error (lexer,
                           _("Syntax error expecting number greater than %g."),
                           min);
            }
          return false;
        }

      if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number less than %g for %s."),
                       max, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number less than %g."), max);
          return false;
        }
    }

  /* Generic message: either the range was empty or inverted, or neither
     bound was worth reporting. */
  if (name)
    lex_error (lexer, _("Syntax error expecting number for %s."), name);
  else
    lex_error (lexer, _("Syntax error expecting number."));
  return false;
}
1297
1298 /* If the current token is an identifier, does nothing and returns true.
1299    Otherwise, reports an error and returns false. */
1300 bool
1301 lex_force_id (struct lexer *lexer)
1302 {
1303   if (lex_token (lexer) == T_ID)
1304     return true;
1305
1306   lex_error (lexer, _("Syntax error expecting identifier."));
1307   return false;
1308 }
1309 \f
1310 /* Token accessors. */
1311
1312 /* Returns the type of LEXER's current token. */
1313 enum token_type
1314 lex_token (const struct lexer *lexer)
1315 {
1316   return lex_next_token (lexer, 0);
1317 }
1318
/* Returns the numeric value of the current token in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token this function returns zero. */
double
lex_tokval (const struct lexer *lexer)
{
  return token_number (lex_next (lexer, 0));
}
1328
1329 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1330
1331    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1332    this functions this function will always return NULL.
1333
1334    The UTF-8 encoding of the returned string is correct for variable names and
1335    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1336    data_in() to use it in a "union value".  */
1337 const char *
1338 lex_tokcstr (const struct lexer *lexer)
1339 {
1340   return lex_next_tokcstr (lexer, 0);
1341 }
1342
1343 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1344    null-terminated (but the null terminator is not included in the returned
1345    substring's 'length').
1346
1347    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1348    this functions this function will always return NULL.
1349
1350    The UTF-8 encoding of the returned string is correct for variable names and
1351    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1352    data_in() to use it in a "union value".  */
1353 struct substring
1354 lex_tokss (const struct lexer *lexer)
1355 {
1356   return lex_next_tokss (lexer, 0);
1357 }
1358 \f
1359 /* Looking ahead.
1360
1361    A value of 0 for N as an argument to any of these functions refers to the
1362    current token.  Lookahead is limited to the current command.  Any N greater
1363    than the number of tokens remaining in the current command will be treated
1364    as referring to a T_ENDCMD token. */
1365
1366 static const struct lex_token *
1367 lex_next__ (const struct lexer *lexer_, int n)
1368 {
1369   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1370   struct lex_source *src = lex_source__ (lexer);
1371
1372   if (src != NULL)
1373     return lex_source_next__ (src, n);
1374   else
1375     {
1376       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1377       return &stop_token;
1378     }
1379 }
1380
1381 static const struct lex_token *
1382 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1383 {
1384   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1385
1386   if (ofs < 0)
1387     {
1388       static const struct lex_token endcmd_token
1389         = { .token = { .type = T_ENDCMD } };
1390       return &endcmd_token;
1391     }
1392
1393   while (ofs >= src->n_parse)
1394     {
1395       if (src->n_parse > 0)
1396         {
1397           const struct lex_token *t = src->parse[src->n_parse - 1];
1398           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1399             return t;
1400         }
1401
1402       lex_source_get_parse (src);
1403     }
1404
1405   return src->parse[ofs];
1406 }
1407
1408 static const struct lex_token *
1409 lex_source_next__ (const struct lex_source *src, int n)
1410 {
1411   return lex_source_ofs__ (src, n + src->parse_ofs);
1412 }
1413
1414 /* Returns the "struct token" of the token N after the current one in LEXER.
1415    The returned pointer can be invalidated by pretty much any succeeding call
1416    into the lexer, although the string pointer within the returned token is
1417    only invalidated by consuming the token (e.g. with lex_get()). */
1418 const struct token *
1419 lex_next (const struct lexer *lexer, int n)
1420 {
1421   return &lex_next__ (lexer, n)->token;
1422 }
1423
1424 /* Returns the type of the token N after the current one in LEXER. */
1425 enum token_type
1426 lex_next_token (const struct lexer *lexer, int n)
1427 {
1428   return lex_next (lexer, n)->type;
1429 }
1430
/* Returns the numeric value of the token N after the current one in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token this function returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1440
1441 /* Returns the null-terminated string in the token N after the current one, in
1442    UTF-8 encoding.
1443
1444    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1445    this functions this function will always return NULL.
1446
1447    The UTF-8 encoding of the returned string is correct for variable names and
1448    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1449    data_in() to use it in a "union value".  */
1450 const char *
1451 lex_next_tokcstr (const struct lexer *lexer, int n)
1452 {
1453   return lex_next_tokss (lexer, n).string;
1454 }
1455
1456 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1457    The string is null-terminated (but the null terminator is not included in
1458    the returned substring's 'length').
1459
1460    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1461    tokens this functions this function will always return NULL.
1462
1463    The UTF-8 encoding of the returned string is correct for variable names and
1464    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1465    data_in() to use it in a "union value".  */
1466 struct substring
1467 lex_next_tokss (const struct lexer *lexer, int n)
1468 {
1469   return lex_next (lexer, n)->string;
1470 }
1471
1472 /* Returns the offset of the current token within the command being parsed in
1473    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1474    on.  The return value is useful later for referring to this token in calls
1475    to lex_ofs_*(). */
1476 int
1477 lex_ofs (const struct lexer *lexer)
1478 {
1479   struct lex_source *src = lex_source__ (lexer);
1480   return src ? src->parse_ofs : 0;
1481 }
1482
1483 /* Returns the offset of the last token in the current command. */
1484 int
1485 lex_max_ofs (const struct lexer *lexer)
1486 {
1487   struct lex_source *src = lex_source__ (lexer);
1488   if (!src)
1489     return 0;
1490
1491   int ofs = MAX (1, src->n_parse) - 1;
1492   for (;;)
1493     {
1494       enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1495       if (type == T_ENDCMD || type == T_STOP)
1496         return ofs;
1497
1498       ofs++;
1499     }
1500 }
1501
1502 /* Returns the token within LEXER's current command with offset OFS.  Use
1503    lex_ofs() to find out the offset of the current token. */
1504 const struct token *
1505 lex_ofs_token (const struct lexer *lexer_, int ofs)
1506 {
1507   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1508   struct lex_source *src = lex_source__ (lexer);
1509
1510   if (src != NULL)
1511     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1512   else
1513     {
1514       static const struct token stop_token = { .type = T_STOP };
1515       return &stop_token;
1516     }
1517 }
1518
/* Allocates and returns a new struct msg_location covering the tokens with
   offsets OFS0 through OFS1, inclusive, within LEXER's current command.
   Token offsets are explained at lex_ofs().

   The returned object belongs to the caller, who must eventually free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() wants distances from the current token. */
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1530
1531 /* Returns a msg_point for the first character in the token with offset OFS,
1532    where offset 0 is the first token in the command currently being parsed, 1
1533    the second token, and so on.  These are absolute offsets, not relative to
1534    the token currently being parsed within the command.
1535
1536    Returns zeros for a T_STOP token.
1537  */
1538 struct msg_point
1539 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1540 {
1541   const struct lex_source *src = lex_source__ (lexer);
1542   return (src
1543           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1544           : (struct msg_point) { 0, 0 });
1545 }
1546
1547 /* Returns a msg_point for the last character, inclusive, in the token with
1548    offset OFS, where offset 0 is the first token in the command currently being
1549    parsed, 1 the second token, and so on.  These are absolute offsets, not
1550    relative to the token currently being parsed within the command.
1551
1552    Returns zeros for a T_STOP token.
1553
1554    Most of the time, a single token is wholly within a single line of syntax,
1555    so that the start and end point for a given offset have the same line
1556    number.  There are two exceptions: a T_STRING token can be made up of
1557    multiple segments on adjacent lines connected with "+" punctuators, and a
1558    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1559    the next.
1560  */
1561 struct msg_point
1562 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1563 {
1564   const struct lex_source *src = lex_source__ (lexer);
1565   return (src
1566           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1567           : (struct msg_point) { 0, 0 });
1568 }
1569
1570 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1571    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1572    are both zero, this requests the syntax for the current token.)
1573
1574    The caller must eventually free the returned string (with free()).  The
1575    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1576    that, for example, it may include comments, spaces, and new-lines if it
1577    spans multiple tokens.  Macro expansion, however, has already been
1578    performed. */
1579 char *
1580 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1581 {
1582   const struct lex_source *src = lex_source__ (lexer);
1583   return (src
1584           ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1585           : xstrdup (""));
1586 }
1587
1588
/* Returns the syntax text for the tokens with offsets OFS0 through OFS1,
   inclusive.  (Passing zero for both OFS0 and OFS1, for example, asks for the
   syntax of the first token in the current command.)  If LEXER has no source,
   returns an empty string.

   The returned string is owned by the caller, who must eventually release it
   with free().  It is encoded in UTF-8 and is in the original form supplied
   to the lexer, so that when it spans multiple tokens it may include
   comments, spaces, and new-lines.  Macro expansion, however, has already
   been performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1604
1605 /* Returns true if the token N ahead of the current one was produced by macro
1606    expansion, false otherwise. */
1607 bool
1608 lex_next_is_from_macro (const struct lexer *lexer, int n)
1609 {
1610   return lex_next__ (lexer, n)->macro_rep != NULL;
1611 }
1612
1613 static bool
1614 lex_tokens_match (const struct token *actual, const struct token *expected)
1615 {
1616   if (actual->type != expected->type)
1617     return false;
1618
1619   switch (actual->type)
1620     {
1621     case T_POS_NUM:
1622     case T_NEG_NUM:
1623       return actual->number == expected->number;
1624
1625     case T_ID:
1626       return lex_id_match (expected->string, actual->string);
1627
1628     case T_STRING:
1629       return (actual->string.length == expected->string.length
1630               && !memcmp (actual->string.string, expected->string.string,
1631                           actual->string.length));
1632
1633     default:
1634       return true;
1635     }
1636 }
1637
1638 static bool
1639 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s,
1640                      size_t *n_matchedp)
1641 {
1642   struct string_lexer slex;
1643   struct token token;
1644
1645   size_t n_matched = 0;
1646   bool all_matched = true;
1647   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1648   while (string_lexer_next (&slex, &token))
1649     {
1650       bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + n_matched),
1651                                      &token);
1652       token_uninit (&token);
1653       if (!match)
1654         {
1655           all_matched = false;
1656           break;
1657         }
1658       n_matched++;
1659     }
1660   if (n_matchedp)
1661     *n_matchedp = n_matched;
1662   return all_matched;
1663 }
1664
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  The lexer's position does not change.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  int start = lex_ofs (lexer);
  return lex_ofs_at_phrase__ (lexer, start, s, NULL);
}
1676
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips those tokens and returns true.  Otherwise, returns false and leaves
   the lexer where it was.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched;
  bool found = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched);
  if (found)
    lex_get_n (lexer, n_matched);
  return found;
}
1692
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips those tokens and returns true.  Otherwise, issues an error and
   returns false.  The error's location runs from the current token through
   the token N_MATCHED ahead of it, where N_MATCHED is the number of tokens of
   S that did match.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_force_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched;
  if (!lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched))
    {
      lex_next_error (lexer, 0, n_matched,
                      _("Syntax error expecting `%s'."), s);
      return false;
    }

  lex_get_n (lexer, n_matched);
  return true;
}
1710
1711 /* Returns the 1-based line number of the source text at the byte OFFSET in
1712    SRC. */
1713 static int
1714 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1715 {
1716   size_t lo = 0;
1717   size_t hi = src->n_lines;
1718   for (;;)
1719     {
1720       size_t mid = (lo + hi) / 2;
1721       if (mid + 1 >= src->n_lines)
1722         return src->n_lines;
1723       else if (offset >= src->lines[mid + 1])
1724         lo = mid;
1725       else if (offset < src->lines[mid])
1726         hi = mid;
1727       else
1728         return mid + 1;
1729     }
1730 }
1731
1732 /* Returns the 1-based column number of the source text at the byte OFFSET in
1733    SRC. */
1734 static int
1735 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1736 {
1737   const char *newline = memrchr (src->buffer, '\n', offset);
1738   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1739   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1740 }
1741
1742 static struct msg_point
1743 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1744 {
1745   return (struct msg_point) {
1746     .line = lex_source_ofs_to_line_number (src, offset),
1747     .column = lex_source_ofs_to_column_number (src, offset),
1748   };
1749 }
1750
1751 static struct msg_point
1752 lex_token_start_point (const struct lex_source *src,
1753                        const struct lex_token *token)
1754 {
1755   return lex_source_ofs_to_point__ (src, token->token_pos);
1756 }
1757
1758 static struct msg_point
1759 lex_token_end_point (const struct lex_source *src,
1760                      const struct lex_token *token)
1761 {
1762   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1763 }
1764
1765 static struct msg_location
1766 lex_token_location (const struct lex_source *src,
1767                     const struct lex_token *t0,
1768                     const struct lex_token *t1)
1769 {
1770   return (struct msg_location) {
1771     .file_name = intern_new_if_nonnull (src->reader->file_name),
1772     .start = lex_token_start_point (src, t0),
1773     .end = lex_token_end_point (src, t1),
1774     .src = CONST_CAST (struct lex_source *, src),
1775   };
1776 }
1777
1778 static struct msg_location *
1779 lex_token_location_rw (const struct lex_source *src,
1780                        const struct lex_token *t0,
1781                        const struct lex_token *t1)
1782 {
1783   struct msg_location location = lex_token_location (src, t0, t1);
1784   return msg_location_dup (&location);
1785 }
1786
/* Returns a msg_location, produced by lex_token_location_rw(), that spans the
   tokens in SRC at offsets OFS0 through OFS1 (as looked up by
   lex_source_ofs__()). */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1794
1795 /* Returns the name of the syntax file from which the current command is drawn.
1796    Returns NULL for a T_STOP token or if the command's source does not have
1797    line numbers.
1798
1799    There is no version of this function that takes an N argument because
1800    lookahead only works to the end of a command and any given command is always
1801    within a single syntax file. */
1802 const char *
1803 lex_get_file_name (const struct lexer *lexer)
1804 {
1805   struct lex_source *src = lex_source__ (lexer);
1806   return src == NULL ? NULL : src->reader->file_name;
1807 }
1808
1809 /* Returns a newly allocated msg_location for the syntax that represents tokens
1810    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1811    must eventually free the location (with msg_location_destroy()). */
1812 struct msg_location *
1813 lex_get_location (const struct lexer *lexer, int n0, int n1)
1814 {
1815   struct msg_location *loc = xmalloc (sizeof *loc);
1816   *loc = (struct msg_location) {
1817     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1818     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1819     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1820     .src = lex_source__ (lexer),
1821   };
1822   lex_source_ref (loc->src);
1823   return loc;
1824 }
1825
1826 const char *
1827 lex_get_encoding (const struct lexer *lexer)
1828 {
1829   struct lex_source *src = lex_source__ (lexer);
1830   return src == NULL ? NULL : src->reader->encoding;
1831 }
1832
1833 /* Returns the syntax mode for the syntax file from which the current drawn is
1834    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1835    does not have line numbers.
1836
1837    There is no version of this function that takes an N argument because
1838    lookahead only works to the end of a command and any given command is always
1839    within a single syntax file. */
1840 enum segmenter_mode
1841 lex_get_syntax_mode (const struct lexer *lexer)
1842 {
1843   struct lex_source *src = lex_source__ (lexer);
1844   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1845 }
1846
1847 /* Returns the error mode for the syntax file from which the current drawn is
1848    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1849    source does not have line numbers.
1850
1851    There is no version of this function that takes an N argument because
1852    lookahead only works to the end of a command and any given command is always
1853    within a single syntax file. */
1854 enum lex_error_mode
1855 lex_get_error_mode (const struct lexer *lexer)
1856 {
1857   struct lex_source *src = lex_source__ (lexer);
1858   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1859 }
1860
1861 /* If the source that LEXER is currently reading has error mode
1862    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1863    token to be read comes directly from whatever is next read from the stream.
1864
1865    It makes sense to call this function after encountering an error in a
1866    command entered on the console, because usually the user would prefer not to
1867    have cascading errors. */
1868 void
1869 lex_interactive_reset (struct lexer *lexer)
1870 {
1871   struct lex_source *src = lex_source__ (lexer);
1872   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1873     {
1874       src->length = 0;
1875       src->journal_pos = src->seg_pos = 0;
1876       src->n_lines = 0;
1877       src->suppress_next_newline = false;
1878       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1879                                        false);
1880       lex_stage_clear (&src->pp);
1881       lex_stage_clear (&src->merge);
1882       lex_source_clear_parse (src);
1883       lex_source_push_endcmd__ (src);
1884     }
1885 }
1886
1887 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1888 void
1889 lex_discard_rest_of_command (struct lexer *lexer)
1890 {
1891   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1892     lex_get (lexer);
1893 }
1894
1895 /* Discards all lookahead tokens in LEXER, then discards all input sources
1896    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1897    runs out of input sources. */
1898 void
1899 lex_discard_noninteractive (struct lexer *lexer)
1900 {
1901   struct lex_source *src = lex_source__ (lexer);
1902   if (src != NULL)
1903     {
1904       if (src->reader->error == LEX_ERROR_IGNORE)
1905         return;
1906
1907       lex_stage_clear (&src->pp);
1908       lex_stage_clear (&src->merge);
1909       lex_source_clear_parse (src);
1910
1911       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1912            src = lex_source__ (lexer))
1913         {
1914           ll_remove (&src->ll);
1915           lex_source_unref (src);
1916         }
1917     }
1918 }
1919 \f
1920 static void
1921 lex_source_expand__ (struct lex_source *src)
1922 {
1923   if (src->length >= src->allocated)
1924     src->buffer = x2realloc (src->buffer, &src->allocated);
1925 }
1926
1927 static void
1928 lex_source_read__ (struct lex_source *src)
1929 {
1930   do
1931     {
1932       lex_source_expand__ (src);
1933
1934       size_t space = src->allocated - src->length;
1935       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1936       size_t n = src->reader->class->read (src->reader,
1937                                            &src->buffer[src->length],
1938                                            space, prompt);
1939       assert (n <= space);
1940
1941       if (n == 0)
1942         {
1943           /* End of input. */
1944           src->reader->eof = true;
1945           return;
1946         }
1947
1948       src->length += n;
1949     }
1950   while (!memchr (&src->buffer[src->seg_pos], '\n',
1951                   src->length - src->seg_pos));
1952 }
1953
1954 static struct lex_source *
1955 lex_source__ (const struct lexer *lexer)
1956 {
1957   return (ll_is_empty (&lexer->sources) ? NULL
1958           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1959 }
1960
/* Public, read-only counterpart of lex_source__(): returns the source at the
   head of LEXER's list of sources, or NULL if there is none. */
const struct lex_source *
lex_source (const struct lexer *lexer)
{
  const struct lex_source *current = lex_source__ (lexer);
  return current;
}
1966
1967 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1968    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
1969    both zero, this requests the syntax for the first token in the current
1970    command.)  The caller must eventually free the returned string (with
1971    free()).  The syntax is encoded in UTF-8 and in the original form supplied
1972    to the lexer so that, for example, it may include comments, spaces, and
1973    new-lines if it spans multiple tokens.  Macro expansion, however, has
1974    already been performed. */
1975 static char *
1976 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1977 {
1978   struct string s = DS_EMPTY_INITIALIZER;
1979   for (size_t i = ofs0; i <= ofs1; )
1980     {
1981       /* Find [I,J) as the longest sequence of tokens not produced by macro
1982          expansion, or otherwise the longest sequence expanded from a single
1983          macro call. */
1984       const struct lex_token *first = lex_source_ofs__ (src, i);
1985       size_t j;
1986       for (j = i + 1; j <= ofs1; j++)
1987         {
1988           const struct lex_token *cur = lex_source_ofs__ (src, j);
1989           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1990               || first->macro_rep != cur->macro_rep)
1991             break;
1992         }
1993       const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1994
1995       /* Now add the syntax for this sequence of tokens to SRC. */
1996       if (!ds_is_empty (&s))
1997         ds_put_byte (&s, ' ');
1998       if (!first->macro_rep)
1999         {
2000           size_t start = first->token_pos;
2001           size_t end = last->token_pos + last->token_len;
2002           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
2003         }
2004       else
2005         {
2006           size_t start = first->ofs;
2007           size_t end = last->ofs + last->len;
2008           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
2009                                            end - start));
2010         }
2011
2012       i = j;
2013     }
2014   return ds_steal_cstr (&s);
2015 }
2016
2017 static bool
2018 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
2019 {
2020   for (int i = ofs0; i <= ofs1; i++)
2021     if (lex_source_ofs__ (src, i)->macro_rep)
2022       return true;
2023   return false;
2024 }
2025
2026 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
2027    raw UTF-8 syntax for the macro call (not for the expansion) and for any
2028    other tokens included in that range.  The syntax is encoded in UTF-8 and in
2029    the original form supplied to the lexer so that, for example, it may include
2030    comments, spaces, and new-lines if it spans multiple tokens.
2031
2032    Returns an empty string if the token range doesn't include a macro call.
2033
2034    The caller must not modify or free the returned string. */
2035 static struct substring
2036 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2037 {
2038   if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2039     return ss_empty ();
2040
2041   const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2042   const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2043   size_t start = token0->token_pos;
2044   size_t end = token1->token_pos + token1->token_len;
2045
2046   return ss_buffer (&src->buffer[start], end - start);
2047 }
2048
2049 static void
2050 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2051                        int ofs0, int ofs1, const char *format, va_list args)
2052 {
2053   struct string s = DS_EMPTY_INITIALIZER;
2054
2055   if (src)
2056     {
2057       /* Get the macro call(s) that expanded to the syntax that caused the
2058          error. */
2059       char call[64];
2060       str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2061                      call, sizeof call);
2062       if (call[0])
2063         ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2064     }
2065   else
2066     ds_put_cstr (&s, _("At end of input"));
2067
2068   if (!ds_is_empty (&s))
2069     ds_put_cstr (&s, ": ");
2070   if (format)
2071     ds_put_vformat (&s, format, args);
2072   else
2073     ds_put_cstr (&s, _("Syntax error."));
2074
2075   if (ds_last (&s) != '.')
2076     ds_put_byte (&s, '.');
2077
2078   struct msg *m = xmalloc (sizeof *m);
2079   *m = (struct msg) {
2080     .category = msg_class_to_category (class),
2081     .severity = msg_class_to_severity (class),
2082     .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2083     .text = ds_steal_cstr (&s),
2084   };
2085   msg_emit (m);
2086 }
2087
2088 static void
2089 lex_get_error (struct lex_source *src, const struct lex_token *token)
2090 {
2091   char syntax[64];
2092   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2093                  syntax, sizeof syntax);
2094
2095   struct string s = DS_EMPTY_INITIALIZER;
2096   ds_put_cstr (&s, token->token.string.string);
2097
2098   struct msg *m = xmalloc (sizeof *m);
2099   *m = (struct msg) {
2100     .category = MSG_C_SYNTAX,
2101     .severity = MSG_S_ERROR,
2102     .location = lex_token_location_rw (src, token, token),
2103     .text = ds_steal_cstr (&s),
2104   };
2105   msg_emit (m);
2106 }
2107
/* Attempts to append an additional token to 'pp' in SRC, reading more from the
   underlying lex_reader if necessary.  Returns true if a new token was pushed
   onto SRC's 'pp' stage, false otherwise (the segment yielded no token, or it
   yielded a tokenization error, which is reported here).  The caller should
   retry failures unless SRC's 'eof' marker was set to true indicating that
   there will be no more tokens from this source.

   As side effects, each call consumes exactly one segment from SRC's buffer,
   records the start of a new line in SRC's line table whenever that segment
   is a new-line, and submits any completed source lines to the output engine
   as syntax text items. */
static bool
lex_source_try_get_pp (struct lex_source *src)
{
  /* Allocate the candidate token and initialize it.  It starts out as T_STOP
     and is not tied to any macro expansion.  'token_len' is filled in below,
     once the length of the segment is known. */
  struct lex_token *token = xmalloc (sizeof *token);
  token->token = (struct token) { .type = T_STOP };
  token->macro_rep = NULL;
  token->ref_cnt = NULL;
  token->token_pos = src->seg_pos;

  /* Extract a segment. */
  const char *segment;
  enum segment_type seg_type;
  int seg_len;
  for (;;)
    {
      /* SEGMENT must be recomputed on every iteration because
         lex_source_read__() may reallocate SRC's buffer. */
      segment = &src->buffer[src->seg_pos];
      seg_len = segmenter_push (&src->segmenter, segment,
                                src->length - src->seg_pos,
                                src->reader->eof, &seg_type);
      if (seg_len >= 0)
        break;

      /* The segmenter needs more input to produce a segment. */
      assert (!src->reader->eof);
      lex_source_read__ (src);
    }

  /* Update state based on the segment. */
  token->token_len = seg_len;
  src->seg_pos += seg_len;
  if (seg_type == SEG_NEWLINE)
    {
      /* The next line starts just past this new-line segment: record its
         byte offset in the line table, growing the table if it is full. */
      if (src->n_lines >= src->allocated_lines)
        src->lines = x2nrealloc (src->lines, &src->allocated_lines,
                                 sizeof *src->lines);
      src->lines[src->n_lines++] = src->seg_pos;
    }

  /* Get a token from the segment. */
  enum tokenize_result result = token_from_segment (
    seg_type, ss_buffer (segment, seg_len), &token->token);

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.

     N_LINES is the number of lines to submit now.  An end-of-command segment
     submits its line immediately and sets 'suppress_next_newline', so that the
     new-line segment that follows does not submit the same line again. */
  int n_lines = seg_type == SEG_NEWLINE;
  if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->length even though that may be beyond what we've actually
         converted to tokens.  That's because, if we're emitting the line due
         to SEG_END_COMMAND, we want to take the whole line through the
         newline, not just through the '.'. */
      size_t max_len = src->length - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax.  The text item is given its own
         null-terminated copy of the line. */
      output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                   xmemdup0 (line, copy_len),
                                                   NULL));

      /* The next line to submit starts just past this one. */
      src->journal_pos += line_len;
    }

  switch (result)
    {
    case TOKENIZE_ERROR:
      /* Report the error, then discard the token as for an empty segment. */
      lex_get_error (src, token);
      /* Fall through. */
    case TOKENIZE_EMPTY:
      lex_token_destroy (token);
      return false;

    case TOKENIZE_TOKEN:
      if (token->token.type == T_STOP)
        {
          /* End of input: hand the consumer a T_ENDCMD instead of the T_STOP
             and mark SRC as exhausted. */
          token->token.type = T_ENDCMD;
          src->eof = true;
        }
      lex_stage_push_last (&src->pp, token);
      return true;
    }
  NOT_REACHED ();
}
2219
2220 /* Attempts to append a new token to SRC.  Returns true if successful, false on
2221    failure.  On failure, the end of SRC has been reached and no more tokens
2222    will be forthcoming from it.
2223
2224    Does not make the new token available for lookahead yet; the caller must
2225    adjust SRC's 'middle' pointer to do so. */
2226 static bool
2227 lex_source_get_pp (struct lex_source *src)
2228 {
2229   while (!src->eof)
2230     if (lex_source_try_get_pp (src))
2231       return true;
2232   return false;
2233 }
2234
/* Attempts to move at least one token from the 'pp' stage in SRC_ into its
   'merge' stage, expanding a macro call at the front of 'pp' if there is one.

   Returns true if at least one token was appended to 'merge'.  Returns false
   if 'pp' was empty and could not be refilled, or if a macro call was expanded
   but its expansion contained no tokens; in the latter case the tokens of the
   call have still been consumed from 'pp'.

   SRC_ is declared const, but this function casts the const away and modifies
   the source. */
static bool
lex_source_try_get_merge (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);

  /* Make sure that 'pp' has at least one token to work with. */
  if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
    return false;

  /* With macro expansion disabled, every token in 'pp' passes through
     unchanged. */
  if (!settings_get_mexpand ())
    {
      lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
      return true;
    }

  /* Now pass tokens one-by-one to the macro expander.

     In the common case where there is no macro to expand, the loop is not
     entered.

     N_CALL is zero while the expander needs to see more tokens, negative once
     it decides that there is no macro call after all, and otherwise the
     number of tokens at the front of 'pp' that make up the call. */
  struct macro_call *mc;
  int n_call = macro_call_create (src->lexer->macros,
                                  &lex_stage_first (&src->pp)->token, &mc);
  for (int ofs = 1; !n_call; ofs++)
    {
      if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
        {
          /* This should not be reachable because we always get a T_ENDCMD at
             the end of an input file (transformed from T_STOP by
             lex_source_try_get_pp()) and the macro_expander should always
             terminate expansion on T_ENDCMD. */
          NOT_REACHED ();
        }

      /* Feed token OFS, along with its raw syntax and location, to the
         expander. */
      const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
      const struct macro_token mt = {
        .token = t->token,
        .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
      };
      const struct msg_location loc = lex_token_location (src, t, t);
      n_call = macro_call_add (mc, &mt, &loc);
    }
  if (n_call < 0)
    {
      /* False alarm: no macro expansion after all.  Use first token as
         lookahead.  We'll retry macro expansion from the second token next
         time around. */
      macro_call_destroy (mc);
      lex_stage_shift (&src->merge, &src->pp, 1);
      return true;
    }

  /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
     are a macro call.  (These are likely to be the only tokens in 'pp'.)
     Expand them.  */
  const struct lex_token *c0 = lex_stage_first (&src->pp);
  const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
  struct macro_tokens expansion = { .n = 0 };
  struct msg_location loc = lex_token_location (src, c0, c1);
  macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
  macro_call_destroy (mc);

  /* Convert the macro expansion into syntax for possible error messages
     later.  OFS[i] and LEN[i] receive the position and length of expansion
     token i within the syntax string S. */
  size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
  size_t *len = xnmalloc (expansion.n, sizeof *len);
  struct string s = DS_EMPTY_INITIALIZER;
  macro_tokens_to_syntax (&expansion, &s, ofs, len);

  if (settings_get_mprint ())
    output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
                                          _("Macro Expansion")));

  /* Append the macro expansion tokens to the lookahead. */
  if (expansion.n > 0)
    {
      /* All of the tokens from this expansion share one copy of the
         expansion's syntax and one reference count for it, which starts out
         equal to the number of tokens that point to it. */
      char *macro_rep = ds_steal_cstr (&s);
      size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
      *ref_cnt = expansion.n;
      for (size_t i = 0; i < expansion.n; i++)
        {
          /* Each expanded token is positioned over the whole macro call
             (C0 through C1) in the source buffer, whereas 'ofs' and 'len'
             locate it within the expansion's own syntax. */
          struct lex_token *token = xmalloc (sizeof *token);
          *token = (struct lex_token) {
            .token = expansion.mts[i].token,
            .token_pos = c0->token_pos,
            .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
            .macro_rep = macro_rep,
            .ofs = ofs[i],
            .len = len[i],
            .ref_cnt = ref_cnt,
          };
          lex_stage_push_last (&src->merge, token);

          /* The token itself was copied into the new lex_token above; only
             the macro token's syntax is released here. */
          ss_dealloc (&expansion.mts[i].syntax);
        }
    }
  else
    ds_destroy (&s);
  free (expansion.mts);
  free (ofs);
  free (len);

  /* Destroy the tokens for the call. */
  for (size_t i = 0; i < n_call; i++)
    lex_stage_pop_first (&src->pp);

  /* An empty expansion contributed nothing to 'merge'. */
  return expansion.n > 0;
}
2341
2342 /* Attempts to obtain at least one new token into 'merge' in SRC.
2343
2344    Returns true if successful, false on failure.  In the latter case, SRC is
2345    exhausted and 'src->eof' is now true. */
2346 static bool
2347 lex_source_get_merge (struct lex_source *src)
2348 {
2349   while (!src->eof)
2350     if (lex_source_try_get_merge (src))
2351       return true;
2352   return false;
2353 }
2354
2355 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2356
2357    Returns true if successful, false on failure.  In the latter case, SRC is
2358    exhausted and 'src->eof' is now true. */
2359 static bool
2360 lex_source_get_parse (struct lex_source *src)
2361 {
2362   struct merger m = MERGER_INIT;
2363   struct token out;
2364   for (size_t i = 0; ; i++)
2365     {
2366       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2367         {
2368           /* We always get a T_ENDCMD at the end of an input file
2369              (transformed from T_STOP by lex_source_try_get_pp()) and
2370              merger_add() should never return -1 on T_ENDCMD. */
2371           assert (lex_stage_is_empty (&src->merge));
2372           return false;
2373         }
2374
2375       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2376                                &out);
2377       if (!retval)
2378         {
2379           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2380           return true;
2381         }
2382       else if (retval > 0)
2383         {
2384           /* Add a token that merges all the tokens together. */
2385           const struct lex_token *first = lex_stage_first (&src->merge);
2386           const struct lex_token *last = lex_stage_nth (&src->merge,
2387                                                         retval - 1);
2388           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2389           struct lex_token *t = xmalloc (sizeof *t);
2390           *t = (struct lex_token) {
2391             .token = out,
2392             .token_pos = first->token_pos,
2393             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2394
2395             /* This works well if all the tokens were not expanded from macros,
2396                or if they came from the same macro expansion.  It just gives up
2397                in the other (corner) cases. */
2398             .macro_rep = macro ? first->macro_rep : NULL,
2399             .ofs = macro ? first->ofs : 0,
2400             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2401             .ref_cnt = macro ? first->ref_cnt : NULL,
2402           };
2403           if (t->ref_cnt)
2404             ++*t->ref_cnt;
2405           lex_source_push_parse (src, t);
2406
2407           for (int i = 0; i < retval; i++)
2408             lex_stage_pop_first (&src->merge);
2409           return true;
2410         }
2411     }
2412 }
2413 \f
2414 static void
2415 lex_source_push_endcmd__ (struct lex_source *src)
2416 {
2417   assert (src->n_parse == 0);
2418
2419   struct lex_token *token = xmalloc (sizeof *token);
2420   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2421   lex_source_push_parse (src, token);
2422 }
2423
2424 static void
2425 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2426 {
2427   if (src->n_parse >= src->allocated_parse)
2428     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2429                              sizeof *src->parse);
2430   src->parse[src->n_parse++] = token;
2431 }
2432
2433 static void
2434 lex_source_clear_parse (struct lex_source *src)
2435 {
2436   for (size_t i = 0; i < src->n_parse; i++)
2437     lex_token_destroy (src->parse[i]);
2438   src->n_parse = src->parse_ofs = 0;
2439 }
2440
2441 static struct lex_source *
2442 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2443 {
2444   size_t allocated_lines = 4;
2445   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2446   *lines = 0;
2447
2448   struct lex_source *src = xmalloc (sizeof *src);
2449   *src = (struct lex_source) {
2450     .n_refs = 1,
2451     .reader = reader,
2452     .segmenter = segmenter_init (reader->syntax, false),
2453     .lexer = lexer,
2454     .lines = lines,
2455     .n_lines = 1,
2456     .allocated_lines = allocated_lines,
2457   };
2458
2459   lex_source_push_endcmd__ (src);
2460
2461   return src;
2462 }
2463
2464 void
2465 lex_set_message_handler (struct lexer *lexer,
2466                          void (*output_msg) (const struct msg *,
2467                                              struct lexer *))
2468 {
2469   struct msg_handler msg_handler = {
2470     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2471     .aux = lexer,
2472     .lex_source_ref = lex_source_ref,
2473     .lex_source_unref = lex_source_unref,
2474     .lex_source_get_line = lex_source_get_line,
2475   };
2476   msg_set_handler (&msg_handler);
2477 }
2478
2479 struct lex_source *
2480 lex_source_ref (const struct lex_source *src_)
2481 {
2482   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2483   if (src)
2484     {
2485       assert (src->n_refs > 0);
2486       src->n_refs++;
2487     }
2488   return src;
2489 }
2490
2491 void
2492 lex_source_unref (struct lex_source *src)
2493 {
2494   if (!src)
2495     return;
2496
2497   assert (src->n_refs > 0);
2498   if (--src->n_refs > 0)
2499     return;
2500
2501   char *file_name = src->reader->file_name;
2502   char *encoding = src->reader->encoding;
2503   if (src->reader->class->destroy != NULL)
2504     src->reader->class->destroy (src->reader);
2505   free (file_name);
2506   free (encoding);
2507   free (src->buffer);
2508   free (src->lines);
2509   lex_stage_uninit (&src->pp);
2510   lex_stage_uninit (&src->merge);
2511   lex_source_clear_parse (src);
2512   free (src->parse);
2513   free (src);
2514 }
2515 \f
2516 struct lex_file_reader
2517   {
2518     struct lex_reader reader;
2519     struct u8_istream *istream;
2520   };
2521
2522 static struct lex_reader_class lex_file_reader_class;
2523
2524 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2525    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2526    ENCODING, which should take one of the forms accepted by
2527    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2528    mode of the new reader, respectively.
2529
2530    Returns a null pointer if FILE_NAME cannot be opened. */
2531 struct lex_reader *
2532 lex_reader_for_file (const char *file_name, const char *encoding,
2533                      enum segmenter_mode syntax,
2534                      enum lex_error_mode error)
2535 {
2536   struct lex_file_reader *r;
2537   struct u8_istream *istream;
2538
2539   istream = (!strcmp(file_name, "-")
2540              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2541              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2542   if (istream == NULL)
2543     {
2544       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2545       return NULL;
2546     }
2547
2548   r = xmalloc (sizeof *r);
2549   lex_reader_init (&r->reader, &lex_file_reader_class);
2550   r->reader.syntax = syntax;
2551   r->reader.error = error;
2552   r->reader.file_name = xstrdup (file_name);
2553   r->reader.encoding = xstrdup_if_nonnull (encoding);
2554   r->reader.line_number = 1;
2555   r->istream = istream;
2556
2557   return &r->reader;
2558 }
2559
2560 static struct lex_file_reader *
2561 lex_file_reader_cast (struct lex_reader *r)
2562 {
2563   return UP_CAST (r, struct lex_file_reader, reader);
2564 }
2565
2566 static size_t
2567 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2568                enum prompt_style prompt_style UNUSED)
2569 {
2570   struct lex_file_reader *r = lex_file_reader_cast (r_);
2571   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2572   if (n_read < 0)
2573     {
2574       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2575       return 0;
2576     }
2577   return n_read;
2578 }
2579
2580 static void
2581 lex_file_close (struct lex_reader *r_)
2582 {
2583   struct lex_file_reader *r = lex_file_reader_cast (r_);
2584
2585   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2586     {
2587       if (u8_istream_close (r->istream) != 0)
2588         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2589     }
2590   else
2591     u8_istream_free (r->istream);
2592
2593   free (r);
2594 }
2595
2596 static struct lex_reader_class lex_file_reader_class =
2597   {
2598     lex_file_read,
2599     lex_file_close
2600   };
2601 \f
2602 struct lex_string_reader
2603   {
2604     struct lex_reader reader;
2605     struct substring s;
2606     size_t offset;
2607   };
2608
2609 static struct lex_reader_class lex_string_reader_class;
2610
2611 /* Creates and returns a new lex_reader for the contents of S, which must be
2612    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2613    with ss_dealloc() when it is closed. */
2614 struct lex_reader *
2615 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2616 {
2617   struct lex_string_reader *r;
2618
2619   r = xmalloc (sizeof *r);
2620   lex_reader_init (&r->reader, &lex_string_reader_class);
2621   r->reader.syntax = SEG_MODE_AUTO;
2622   r->reader.encoding = xstrdup_if_nonnull (encoding);
2623   r->s = s;
2624   r->offset = 0;
2625
2626   return &r->reader;
2627 }
2628
2629 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2630    which must be encoded in ENCODING.  The caller retains ownership of S. */
2631 struct lex_reader *
2632 lex_reader_for_string (const char *s, const char *encoding)
2633 {
2634   return lex_reader_for_substring_nocopy (ss_clone (ss_cstr (s)), encoding);
2635 }
2636
/* Formats FORMAT as a printf()-like format string and creates and returns a
   new lex_reader, with the given ENCODING, for the formatted result.  The
   reader takes ownership of the formatted string.

   Note that the arguments for FORMAT follow ENCODING, not FORMAT itself. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2651
2652 static struct lex_string_reader *
2653 lex_string_reader_cast (struct lex_reader *r)
2654 {
2655   return UP_CAST (r, struct lex_string_reader, reader);
2656 }
2657
2658 static size_t
2659 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2660                  enum prompt_style prompt_style UNUSED)
2661 {
2662   struct lex_string_reader *r = lex_string_reader_cast (r_);
2663   size_t chunk;
2664
2665   chunk = MIN (n, r->s.length - r->offset);
2666   memcpy (buf, r->s.string + r->offset, chunk);
2667   r->offset += chunk;
2668
2669   return chunk;
2670 }
2671
2672 static void
2673 lex_string_close (struct lex_reader *r_)
2674 {
2675   struct lex_string_reader *r = lex_string_reader_cast (r_);
2676
2677   ss_dealloc (&r->s);
2678   free (r);
2679 }
2680
2681 static struct lex_reader_class lex_string_reader_class =
2682   {
2683     lex_string_read,
2684     lex_string_close
2685   };
2686 \f
2687 struct substring
2688 lex_source_get_line (const struct lex_source *src, int line)
2689 {
2690   if (line < 1 || line > src->n_lines)
2691     return ss_empty ();
2692
2693   size_t ofs = src->lines[line - 1];
2694   size_t end;
2695   if (line < src->n_lines)
2696     end = src->lines[line];
2697   else
2698     {
2699       const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2700       end = newline ? newline - src->buffer : src->length;
2701     }
2702   return ss_buffer (&src->buffer[ofs], end - ofs);
2703 }