pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call. */
  70     size_t token_pos;           /* Offset into src->buffer of token start. */
  71     size_t token_len;           /* Length of source for token in bytes. */
  72
  73     /* For a token obtained through macro expansion, this is just this token.
  74
  75        For a token obtained through the lexer in an ordinary way, these are
  76        nulls and zeros. */
  77     char *macro_rep;        /* The whole macro expansion. */
  78     size_t ofs;             /* Offset of this token in macro_rep. */
  79     size_t len;             /* Length of this token in macro_rep. */
  80     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  81   };
  82
  83 static struct msg_point lex_token_start_point (const struct lex_source *,
  84                                                const struct lex_token *);
  85 static struct msg_point lex_token_end_point (const struct lex_source *,
  86                                              const struct lex_token *);
  87
  88 static size_t lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s);
  89
  90 /* Source offset of the last byte in TOKEN. */
  91 static size_t
  92 lex_token_end (const struct lex_token *token)
  93 {
  94   return token->token_pos + MAX (token->token_len, 1) - 1;
  95 }
  96
  97 static void
  98 lex_token_destroy (struct lex_token *t)
  99 {
 100   token_uninit (&t->token);
 101   if (t->ref_cnt)
 102     {
 103       assert (*t->ref_cnt > 0);
 104       if (!--*t->ref_cnt)
 105         {
 106           free (t->macro_rep);
 107           free (t->ref_cnt);
 108         }
 109     }
 110   free (t);
 111 }
 112 \f
 113 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 114    lex_source. */
 115 struct lex_stage
 116   {
 117     struct deque deque;
 118     struct lex_token **tokens;
 119   };
 120
 121 static void lex_stage_clear (struct lex_stage *);
 122 static void lex_stage_uninit (struct lex_stage *);
 123
 124 static size_t lex_stage_count (const struct lex_stage *);
 125 static bool lex_stage_is_empty (const struct lex_stage *);
 126
 127 static struct lex_token *lex_stage_first (struct lex_stage *);
 128 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 129
 130 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 131 static void lex_stage_pop_first (struct lex_stage *);
 132
 133 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 134                              size_t n);
 135
 136 /* Deletes all the tokens from STAGE. */
 137 static void
 138 lex_stage_clear (struct lex_stage *stage)
 139 {
 140   while (!deque_is_empty (&stage->deque))
 141     lex_stage_pop_first (stage);
 142 }
 143
 144 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 145 static void
 146 lex_stage_uninit (struct lex_stage *stage)
 147 {
 148   lex_stage_clear (stage);
 149   free (stage->tokens);
 150 }
 151
 152 /* Returns true if STAGE contains no tokens, otherwise false. */
 153 static bool
 154 lex_stage_is_empty (const struct lex_stage *stage)
 155 {
 156   return deque_is_empty (&stage->deque);
 157 }
 158
 159 /* Returns the number of tokens in STAGE. */
 160 static size_t
 161 lex_stage_count (const struct lex_stage *stage)
 162 {
 163   return deque_count (&stage->deque);
 164 }
 165
 166 /* Returns the first token in STAGE, which must be nonempty.
 167    The first token is the one accessed with the least lookahead. */
 168 static struct lex_token *
 169 lex_stage_first (struct lex_stage *stage)
 170 {
 171   return lex_stage_nth (stage, 0);
 172 }
 173
 174 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 175    lookahead) is 0, the second token is 1, and so on.  There must be at least
 176    INDEX + 1 tokens in STAGE. */
 177 static struct lex_token *
 178 lex_stage_nth (struct lex_stage *stage, size_t index)
 179 {
 180   return stage->tokens[deque_back (&stage->deque, index)];
 181 }
 182
 183 /* Adds TOKEN so that it becomes the last token in STAGE. */
 184 static void
 185 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 186 {
 187   if (deque_is_full (&stage->deque))
 188     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 189                                   sizeof *stage->tokens);
 190   stage->tokens[deque_push_front (&stage->deque)] = token;
 191 }
 192
 193 /* Removes and returns the first token from STAGE. */
 194 static struct lex_token *
 195 lex_stage_take_first (struct lex_stage *stage)
 196 {
 197   return stage->tokens[deque_pop_back (&stage->deque)];
 198 }
 199
 200 /* Removes the first token from STAGE and uninitializes it. */
 201 static void
 202 lex_stage_pop_first (struct lex_stage *stage)
 203 {
 204   lex_token_destroy (lex_stage_take_first (stage));
 205 }
 206
 207 /* Removes the first N tokens from SRC, appending them to DST as the last
 208    tokens. */
 209 static void
 210 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 211 {
 212   for (size_t i = 0; i < n; i++)
 213     lex_stage_push_last (dst, lex_stage_take_first (src));
 214 }
 215
 216 /* A source of tokens, corresponding to a syntax file.
 217
 218    This is conceptually a lex_reader wrapped with everything needed to convert
 219    its UTF-8 bytes into tokens. */
 220 struct lex_source
 221   {
 222     struct ll ll;               /* In lexer's list of sources. */
 223
 224     /* Reference count:
 225
 226        - One for struct lexer.
 227
 228        - One for each struct msg_location that references this source. */
 229     size_t n_refs;
 230
 231     struct lex_reader *reader;
 232     struct lexer *lexer;
 233     struct segmenter segmenter;
 234     bool eof;                   /* True if T_STOP was read from 'reader'. */
 235
 236     /* Buffer of UTF-8 bytes. */
 237     char *buffer;               /* Source file contents. */
 238     size_t length;              /* Number of bytes filled. */
 239     size_t allocated;           /* Number of bytes allocated. */
 240
 241     /* Offsets into 'buffer'. */
 242     size_t journal_pos;         /* First byte not yet output to journal. */
 243     size_t seg_pos;             /* First byte not yet scanned as token. */
 244
 245     /* Offset into 'buffer' of starts of lines. */
 246     size_t *lines;
 247     size_t n_lines, allocated_lines;
 248
 249     bool suppress_next_newline;
 250
 251     /* Tokens.
 252
 253        This is a pipeline with the following stages.  Each token eventually
 254        made available to the parser passes through of these stages.  The stages
 255        are named after the processing that happens in each one.
 256
 257        Initially, tokens come from the segmenter and scanner to 'pp':
 258
 259        - pp: Tokens that need to pass through the macro preprocessor to end up
 260          in 'merge'.
 261
 262        - merge: Tokens that need to pass through scan_merge() to end up in
 263          'parse'.
 264
 265        - parse: Tokens available to the client for parsing.
 266
 267       'pp' and 'merge' store tokens only temporarily until they pass into
 268       'parse'.  Tokens then live in 'parse' until the command is fully
 269       consumed, at which time they are freed together. */
 270     struct lex_stage pp;
 271     struct lex_stage merge;
 272     struct lex_token **parse;
 273     size_t n_parse, allocated_parse, parse_ofs;
 274   };
 275
 276 static struct lex_source *lex_source_create (struct lexer *,
 277                                              struct lex_reader *);
 278
 279 /* Lexer. */
 280 struct lexer
 281   {
 282     struct ll_list sources;     /* Contains "struct lex_source"s. */
 283     struct macro_set *macros;
 284   };
 285
 286 static struct lex_source *lex_source__ (const struct lexer *);
 287 static char *lex_source_syntax__ (const struct lex_source *,
 288                                   int ofs0, int ofs1);
 289 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 290 static void lex_source_push_endcmd__ (struct lex_source *);
 291 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 292 static void lex_source_clear_parse (struct lex_source *);
 293
 294 static bool lex_source_get_parse (struct lex_source *);
 295 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
 296                                    int ofs0, int ofs1,
 297                                    const char *format, va_list)
 298    PRINTF_FORMAT (5, 0);
 299 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 300                                                   int n);
 301 \f
 302 /* Initializes READER with the specified CLASS and otherwise some reasonable
 303    defaults.  The caller should fill in the others members as desired. */
 304 void
 305 lex_reader_init (struct lex_reader *reader,
 306                  const struct lex_reader_class *class)
 307 {
 308   reader->class = class;
 309   reader->syntax = SEG_MODE_AUTO;
 310   reader->error = LEX_ERROR_CONTINUE;
 311   reader->file_name = NULL;
 312   reader->encoding = NULL;
 313   reader->line_number = 0;
 314   reader->eof = false;
 315 }
 316
 317 /* Frees any file name already in READER and replaces it by a copy of
 318    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 319 void
 320 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 321 {
 322   free (reader->file_name);
 323   reader->file_name = xstrdup_if_nonnull (file_name);
 324 }
 325 \f
 326 /* Creates and returns a new lexer. */
 327 struct lexer *
 328 lex_create (void)
 329 {
 330   struct lexer *lexer = xmalloc (sizeof *lexer);
 331   *lexer = (struct lexer) {
 332     .sources = LL_INITIALIZER (lexer->sources),
 333     .macros = macro_set_create (),
 334   };
 335   return lexer;
 336 }
 337
 338 /* Destroys LEXER. */
 339 void
 340 lex_destroy (struct lexer *lexer)
 341 {
 342   if (lexer != NULL)
 343     {
 344       struct lex_source *source, *next;
 345
 346       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 347         {
 348           ll_remove (&source->ll);
 349           lex_source_unref (source);
 350         }
 351       macro_set_destroy (lexer->macros);
 352       free (lexer);
 353     }
 354 }
 355
 356 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 357    same name.  Takes ownership of M. */
 358 void
 359 lex_define_macro (struct lexer *lexer, struct macro *m)
 360 {
 361   macro_set_add (lexer->macros, m);
 362 }
 363
 364 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 365    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 366    token. */
 367 void
 368 lex_include (struct lexer *lexer, struct lex_reader *reader)
 369 {
 370   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 371   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 372 }
 373
 374 /* Appends READER to LEXER, so that it will be read after all other current
 375    readers have already been read. */
 376 void
 377 lex_append (struct lexer *lexer, struct lex_reader *reader)
 378 {
 379   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 380 }
 381 \f
 382 /* Advancing. */
 383
 384 /* Advances LEXER to the next token, consuming the current token. */
 385 void
 386 lex_get (struct lexer *lexer)
 387 {
 388   struct lex_source *src;
 389
 390   src = lex_source__ (lexer);
 391   if (src == NULL)
 392     return;
 393
 394   if (src->parse_ofs < src->n_parse)
 395     {
 396       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 397         lex_source_clear_parse (src);
 398       else
 399         src->parse_ofs++;
 400     }
 401
 402   while (src->parse_ofs == src->n_parse)
 403     if (!lex_source_get_parse (src))
 404       {
 405         ll_remove (&src->ll);
 406         lex_source_unref (src);
 407         src = lex_source__ (lexer);
 408         if (src == NULL)
 409           return;
 410       }
 411 }
 412
 413 /* Advances LEXER by N tokens. */
 414 void
 415 lex_get_n (struct lexer *lexer, size_t n)
 416 {
 417   while (n-- > 0)
 418     lex_get (lexer);
 419 }
 420 \f
 421 /* Issuing errors. */
 422
 423 /* Prints a syntax error message containing the current token and
 424    given message MESSAGE (if non-null). */
 425 void
 426 lex_error (struct lexer *lexer, const char *format, ...)
 427 {
 428   va_list args;
 429
 430   va_start (args, format);
 431   lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
 432                       format, args);
 433   va_end (args);
 434 }
 435
 436 /* Prints a syntax error message for the span of tokens N0 through N1,
 437    inclusive, from the current token in LEXER, adding message MESSAGE (if
 438    non-null). */
 439 void
 440 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 441 {
 442   va_list args;
 443
 444   va_start (args, format);
 445   int ofs = lex_ofs (lexer);
 446   lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
 447   va_end (args);
 448 }
 449
 450 /* Prints a syntax error message for the span of tokens with offsets OFS0
 451    through OFS1, inclusive, within the current command in LEXER, adding message
 452    MESSAGE (if non-null). */
 453 void
 454 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
 455 {
 456   va_list args;
 457
 458   va_start (args, format);
 459   lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
 460   va_end (args);
 461 }
 462
 463 /* Prints a message of the given CLASS containing the current token and given
 464    message MESSAGE (if non-null). */
 465 void
 466 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
 467 {
 468   va_list args;
 469
 470   va_start (args, format);
 471   lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
 472                       format, args);
 473   va_end (args);
 474 }
 475
 476 /* Prints a syntax error message for the span of tokens N0 through N1,
 477    inclusive, from the current token in LEXER, adding message MESSAGE (if
 478    non-null). */
 479 void
 480 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
 481               const char *format, ...)
 482 {
 483   va_list args;
 484
 485   va_start (args, format);
 486   int ofs = lex_ofs (lexer);
 487   lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
 488   va_end (args);
 489 }
 490
 491 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
 492    through OFS1, inclusive, within the current command in LEXER, adding message
 493    MESSAGE (if non-null). */
 494 void
 495 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
 496              const char *format, ...)
 497 {
 498   va_list args;
 499
 500   va_start (args, format);
 501   lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
 502   va_end (args);
 503 }
 504
 505 /* Prints a syntax error message saying that one of the strings provided as
 506    varargs, up to the first NULL, is expected. */
 507 void
 508 (lex_error_expecting) (struct lexer *lexer, ...)
 509 {
 510   va_list args;
 511
 512   va_start (args, lexer);
 513   lex_error_expecting_valist (lexer, args);
 514   va_end (args);
 515 }
 516
 517 /* Prints a syntax error message saying that one of the options provided in
 518    ARGS, up to the first NULL, is expected. */
 519 void
 520 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 521 {
 522   const char **options = NULL;
 523   size_t allocated = 0;
 524   size_t n = 0;
 525
 526   for (;;)
 527     {
 528       const char *option = va_arg (args, const char *);
 529       if (!option)
 530         break;
 531
 532       if (n >= allocated)
 533         options = x2nrealloc (options, &allocated, sizeof *options);
 534       options[n++] = option;
 535     }
 536   lex_error_expecting_array (lexer, options, n);
 537   free (options);
 538 }
 539
 540 void
 541 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 542 {
 543   switch (n)
 544     {
 545     case 0:
 546       lex_error (lexer, NULL);
 547       break;
 548
 549     case 1:
 550       lex_error (lexer, _("Syntax error expecting %s."), options[0]);
 551       break;
 552
 553     case 2:
 554       lex_error (lexer, _("Syntax error expecting %s or %s."),
 555                  options[0], options[1]);
 556       break;
 557
 558     case 3:
 559       lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
 560                  options[0], options[1], options[2]);
 561       break;
 562
 563     case 4:
 564       lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
 565                  options[0], options[1], options[2], options[3]);
 566       break;
 567
 568     case 5:
 569       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
 570                  options[0], options[1], options[2], options[3], options[4]);
 571       break;
 572
 573     case 6:
 574       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
 575                  options[0], options[1], options[2], options[3], options[4],
 576                  options[5]);
 577       break;
 578
 579     case 7:
 580       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
 581                           "or %s."),
 582                  options[0], options[1], options[2], options[3], options[4],
 583                  options[5], options[6]);
 584       break;
 585
 586     case 8:
 587       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
 588                           "or %s."),
 589                  options[0], options[1], options[2], options[3], options[4],
 590                  options[5], options[6], options[7]);
 591       break;
 592
 593     default:
 594       {
 595         struct string s = DS_EMPTY_INITIALIZER;
 596         for (size_t i = 0; i < n; i++)
 597           {
 598             if (i > 0)
 599               ds_put_cstr (&s, ", ");
 600             ds_put_cstr (&s, options[i]);
 601           }
 602         lex_error (lexer, _("Syntax error expecting one of the following: %s."),
 603                    ds_cstr (&s));
 604         ds_destroy (&s);
 605       }
 606       break;
 607     }
 608 }
 609
 610 /* Reports an error to the effect that subcommand SBC may only be specified
 611    once. */
 612 void
 613 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
 614 {
 615   int ofs = lex_ofs (lexer) - 1;
 616   if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
 617     ofs--;
 618
 619   /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
 620      BY. */
 621   if (lex_ofs_at_phrase__ (lexer, ofs, sbc))
 622     lex_ofs_error (lexer, ofs, ofs,
 623                    _("Subcommand %s may only be specified once."), sbc);
 624   else
 625     msg (SE, _("Subcommand %s may only be specified once."), sbc);
 626 }
 627
 628 /* Reports an error to the effect that subcommand SBC is missing.
 629
 630    This function does not take a lexer as an argument or use lex_error(),
 631    because a missing subcommand can normally be detected only after the whole
 632    command has been parsed, and so lex_error() would always report "Syntax
 633    error at end of command", which does not help the user find the error. */
 634 void
 635 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 636 {
 637   lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
 638                  _("Required subcommand %s was not specified."), sbc);
 639 }
 640
 641 /* Reports an error to the effect that specification SPEC may only be specified
 642    once within subcommand SBC. */
 643 void
 644 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 645 {
 646   lex_error (lexer, _("%s may only be specified once within subcommand %s."),
 647              spec, sbc);
 648 }
 649
 650 /* Reports an error to the effect that specification SPEC is missing within
 651    subcommand SBC. */
 652 void
 653 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 654 {
 655   lex_error (lexer, _("Required %s specification missing from %s subcommand."),
 656              spec, sbc);
 657 }
 658
 659 /* Prints a syntax error message for the span of tokens with offsets OFS0
 660    through OFS1, inclusive, within the current command in LEXER, adding message
 661    MESSAGE (if non-null) with the given ARGS. */
 662 void
 663 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
 664                     int ofs0, int ofs1, const char *format, va_list args)
 665 {
 666   lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
 667 }
 668
 669 /* Checks that we're at end of command.
 670    If so, returns a successful command completion code.
 671    If not, flags a syntax error and returns an error command
 672    completion code. */
 673 int
 674 lex_end_of_command (struct lexer *lexer)
 675 {
 676   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 677     {
 678       lex_error (lexer, _("Syntax error expecting end of command."));
 679       return CMD_FAILURE;
 680     }
 681   else
 682     return CMD_SUCCESS;
 683 }
 684 \f
 685 /* Token testing functions. */
 686
 687 /* Returns true if the current token is a number. */
 688 bool
 689 lex_is_number (const struct lexer *lexer)
 690 {
 691   return lex_next_is_number (lexer, 0);
 692 }
 693
 694 /* Returns true if the current token is a string. */
 695 bool
 696 lex_is_string (const struct lexer *lexer)
 697 {
 698   return lex_next_is_string (lexer, 0);
 699 }
 700
 701 /* Returns the value of the current token, which must be a
 702    floating point number. */
 703 double
 704 lex_number (const struct lexer *lexer)
 705 {
 706   return lex_next_number (lexer, 0);
 707 }
 708
 709 /* Returns true iff the current token is an integer. */
 710 bool
 711 lex_is_integer (const struct lexer *lexer)
 712 {
 713   return lex_next_is_integer (lexer, 0);
 714 }
 715
 716 /* Returns the value of the current token, which must be an
 717    integer. */
 718 long
 719 lex_integer (const struct lexer *lexer)
 720 {
 721   return lex_next_integer (lexer, 0);
 722 }
 723 \f
 724 /* Token testing functions with lookahead.
 725
 726    A value of 0 for N as an argument to any of these functions refers to the
 727    current token.  Lookahead is limited to the current command.  Any N greater
 728    than the number of tokens remaining in the current command will be treated
 729    as referring to a T_ENDCMD token. */
 730
 731 /* Returns true if the token N ahead of the current token is a number. */
 732 bool
 733 lex_next_is_number (const struct lexer *lexer, int n)
 734 {
 735   return token_is_number (lex_next (lexer, n));
 736 }
 737
 738 /* Returns true if the token N ahead of the current token is a string. */
 739 bool
 740 lex_next_is_string (const struct lexer *lexer, int n)
 741 {
 742   return token_is_string (lex_next (lexer, n));
 743 }
 744
 745 /* Returns the value of the token N ahead of the current token, which must be a
 746    floating point number. */
 747 double
 748 lex_next_number (const struct lexer *lexer, int n)
 749 {
 750   return token_number (lex_next (lexer, n));
 751 }
 752
 753 /* Returns true if the token N ahead of the current token is an integer. */
 754 bool
 755 lex_next_is_integer (const struct lexer *lexer, int n)
 756 {
 757   return token_is_integer (lex_next (lexer, n));
 758 }
 759
 760 /* Returns the value of the token N ahead of the current token, which must be
 761    an integer. */
 762 long
 763 lex_next_integer (const struct lexer *lexer, int n)
 764 {
 765   return token_integer (lex_next (lexer, n));
 766 }
 767 \f
 768 /* Token matching functions. */
 769
 770 /* If the current token has the specified TYPE, skips it and returns true.
 771    Otherwise, returns false. */
 772 bool
 773 lex_match (struct lexer *lexer, enum token_type type)
 774 {
 775   if (lex_token (lexer) == type)
 776     {
 777       lex_get (lexer);
 778       return true;
 779     }
 780   else
 781     return false;
 782 }
 783
 784 /* If the current token matches IDENTIFIER, skips it and returns true.
 785    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 786    returns false.
 787
 788    IDENTIFIER must be an ASCII string. */
 789 bool
 790 lex_match_id (struct lexer *lexer, const char *identifier)
 791 {
 792   return lex_match_id_n (lexer, identifier, 3);
 793 }
 794
 795 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 796    may be abbreviated to its first N letters.  Otherwise, returns false.
 797
 798    IDENTIFIER must be an ASCII string. */
 799 bool
 800 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 801 {
 802   if (lex_token (lexer) == T_ID
 803       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 804     {
 805       lex_get (lexer);
 806       return true;
 807     }
 808   else
 809     return false;
 810 }
 811
 812 /* If the current token is integer X, skips it and returns true.  Otherwise,
 813    returns false. */
 814 bool
 815 lex_match_int (struct lexer *lexer, int x)
 816 {
 817   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 818     {
 819       lex_get (lexer);
 820       return true;
 821     }
 822   else
 823     return false;
 824 }
 825 \f
 826 /* Forced matches. */
 827
 828 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 829    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 830    false.
 831
 832    IDENTIFIER must be an ASCII string. */
 833 bool
 834 lex_force_match_id (struct lexer *lexer, const char *identifier)
 835 {
 836   if (lex_match_id (lexer, identifier))
 837     return true;
 838   else
 839     {
 840       lex_error_expecting (lexer, identifier);
 841       return false;
 842     }
 843 }
 844
 845 /* If the current token has the specified TYPE, skips it and returns true.
 846    Otherwise, reports an error and returns false. */
 847 bool
 848 lex_force_match (struct lexer *lexer, enum token_type type)
 849 {
 850   if (lex_token (lexer) == type)
 851     {
 852       lex_get (lexer);
 853       return true;
 854     }
 855   else
 856     {
 857       const char *type_string = token_type_to_string (type);
 858       if (type_string)
 859         {
 860           char *s = xasprintf ("`%s'", type_string);
 861           lex_error_expecting (lexer, s);
 862           free (s);
 863         }
 864       else
 865         lex_error_expecting (lexer, token_type_to_name (type));
 866
 867       return false;
 868     }
 869 }
 870
 871 /* If the current token is a string, does nothing and returns true.
 872    Otherwise, reports an error and returns false. */
 873 bool
 874 lex_force_string (struct lexer *lexer)
 875 {
 876   if (lex_is_string (lexer))
 877     return true;
 878   else
 879     {
 880       lex_error (lexer, _("Syntax error expecting string."));
 881       return false;
 882     }
 883 }
 884
 885 /* If the current token is a string or an identifier, does nothing and returns
 886    true.  Otherwise, reports an error and returns false.
 887
 888    This is meant for use in syntactic situations where we want to encourage the
 889    user to supply a quoted string, but for compatibility we also accept
 890    identifiers.  (One example of such a situation is file names.)  Therefore,
 891    the error message issued when the current token is wrong only says that a
 892    string is expected and doesn't mention that an identifier would also be
 893    accepted. */
 894 bool
 895 lex_force_string_or_id (struct lexer *lexer)
 896 {
 897   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 898 }
 899
 900 /* If the current token is an integer, does nothing and returns true.
 901    Otherwise, reports an error and returns false. */
 902 bool
 903 lex_force_int (struct lexer *lexer)
 904 {
 905   if (lex_is_integer (lexer))
 906     return true;
 907   else
 908     {
 909       lex_error (lexer, _("Syntax error expecting integer."));
 910       return false;
 911     }
 912 }
 913
 914 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 915    nothing and returns true.  Otherwise, reports an error and returns false.
 916    If NAME is nonnull, then it is used in the error message. */
 917 bool
 918 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 919 {
 920   bool is_number = lex_is_number (lexer);
 921   bool is_integer = lex_is_integer (lexer);
 922   bool too_small = (is_integer ? lex_integer (lexer) < min
 923                     : is_number ? lex_number (lexer) < min
 924                     : false);
 925   bool too_big = (is_integer ? lex_integer (lexer) > max
 926                   : is_number ? lex_number (lexer) > max
 927                   : false);
 928   if (is_integer && !too_small && !too_big)
 929     return true;
 930
 931   if (min > max)
 932     {
 933       /* Weird, maybe a bug in the caller.  Just report that we needed an
 934          integer. */
 935       if (name)
 936         lex_error (lexer, _("Syntax error expecting integer for %s."), name);
 937       else
 938         lex_error (lexer, _("Syntax error expecting integer."));
 939     }
 940   else if (min == max)
 941     {
 942       if (name)
 943         lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
 944       else
 945         lex_error (lexer, _("Syntax error expecting %ld."), min);
 946     }
 947   else if (min + 1 == max)
 948     {
 949       if (name)
 950         lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
 951                    min, min + 1, name);
 952       else
 953         lex_error (lexer, _("Syntax error expecting %ld or %ld."),
 954                    min, min + 1);
 955     }
 956   else
 957     {
 958       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 959       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 960
 961       if (report_lower_bound && report_upper_bound)
 962         {
 963           if (name)
 964             lex_error (lexer,
 965                        _("Syntax error expecting integer "
 966                          "between %ld and %ld for %s."),
 967                        min, max, name);
 968           else
 969             lex_error (lexer, _("Syntax error expecting integer "
 970                                 "between %ld and %ld."),
 971                        min, max);
 972         }
 973       else if (report_lower_bound)
 974         {
 975           if (min == 0)
 976             {
 977               if (name)
 978                 lex_error (lexer, _("Syntax error expecting "
 979                                     "non-negative integer for %s."),
 980                            name);
 981               else
 982                 lex_error (lexer, _("Syntax error expecting "
 983                                     "non-negative integer."));
 984             }
 985           else if (min == 1)
 986             {
 987               if (name)
 988                 lex_error (lexer, _("Syntax error expecting "
 989                                     "positive integer for %s."),
 990                            name);
 991               else
 992                 lex_error (lexer, _("Syntax error expecting "
 993                                     "positive integer."));
 994             }
 995           else
 996             {
 997               if (name)
 998                 lex_error (lexer, _("Syntax error expecting "
 999                                     "integer %ld or greater for %s."),
1000                            min, name);
1001               else
1002                 lex_error (lexer, _("Syntax error expecting "
1003                                     "integer %ld or greater."), min);
1004             }
1005         }
1006       else if (report_upper_bound)
1007         {
1008           if (name)
1009             lex_error (lexer,
1010                        _("Syntax error expecting integer less than or equal "
1011                          "to %ld for %s."),
1012                        max, name);
1013           else
1014             lex_error (lexer, _("Syntax error expecting integer less than or "
1015                                 "equal to %ld."),
1016                        max);
1017         }
1018       else
1019         {
1020           if (name)
1021             lex_error (lexer, _("Syntax error expecting integer for %s."),
1022                        name);
1023           else
1024             lex_error (lexer, _("Syntax error expecting integer."));
1025         }
1026     }
1027   return false;
1028 }
1029
1030 /* If the current token is a number, does nothing and returns true.
1031    Otherwise, reports an error and returns false. */
1032 bool
1033 lex_force_num (struct lexer *lexer)
1034 {
1035   if (lex_is_number (lexer))
1036     return true;
1037
1038   lex_error (lexer, _("Syntax error expecting number."));
1039   return false;
1040 }
1041
1042 /* If the current token is an number in the closed range [MIN,MAX], does
1043    nothing and returns true.  Otherwise, reports an error and returns false.
1044    If NAME is nonnull, then it is used in the error message. */
1045 bool
1046 lex_force_num_range_closed (struct lexer *lexer, const char *name,
1047                             double min, double max)
1048 {
1049   bool is_number = lex_is_number (lexer);
1050   bool too_small = is_number && lex_number (lexer) < min;
1051   bool too_big = is_number && lex_number (lexer) > max;
1052   if (is_number && !too_small && !too_big)
1053     return true;
1054
1055   if (min > max)
1056     {
1057       /* Weird, maybe a bug in the caller.  Just report that we needed an
1058          number. */
1059       if (name)
1060         lex_error (lexer, _("Syntax error expecting number for %s."), name);
1061       else
1062         lex_error (lexer, _("Syntax error expecting number."));
1063     }
1064   else if (min == max)
1065     {
1066       if (name)
1067         lex_error (lexer, _("Syntax error expecting number %g for %s."),
1068                    min, name);
1069       else
1070         lex_error (lexer, _("Syntax error expecting number %g."), min);
1071     }
1072   else
1073     {
1074       bool report_lower_bound = min > -DBL_MAX || too_small;
1075       bool report_upper_bound = max < DBL_MAX || too_big;
1076
1077       if (report_lower_bound && report_upper_bound)
1078         {
1079           if (name)
1080             lex_error (lexer,
1081                        _("Syntax error expecting number "
1082                          "between %g and %g for %s."),
1083                        min, max, name);
1084           else
1085             lex_error (lexer, _("Syntax error expecting number "
1086                                 "between %g and %g."),
1087                        min, max);
1088         }
1089       else if (report_lower_bound)
1090         {
1091           if (min == 0)
1092             {
1093               if (name)
1094                 lex_error (lexer, _("Syntax error expecting "
1095                                     "non-negative number for %s."),
1096                            name);
1097               else
1098                 lex_error (lexer, _("Syntax error expecting "
1099                                     "non-negative number."));
1100             }
1101           else
1102             {
1103               if (name)
1104                 lex_error (lexer, _("Syntax error expecting number "
1105                                     "%g or greater for %s."),
1106                            min, name);
1107               else
1108                 lex_error (lexer, _("Syntax error expecting number "
1109                                     "%g or greater."), min);
1110             }
1111         }
1112       else if (report_upper_bound)
1113         {
1114           if (name)
1115             lex_error (lexer,
1116                        _("Syntax error expecting number "
1117                          "less than or equal to %g for %s."),
1118                        max, name);
1119           else
1120             lex_error (lexer, _("Syntax error expecting number "
1121                                 "less than or equal to %g."),
1122                        max);
1123         }
1124       else
1125         {
1126           if (name)
1127             lex_error (lexer, _("Syntax error expecting number for %s."), name);
1128           else
1129             lex_error (lexer, _("Syntax error expecting number."));
1130         }
1131     }
1132   return false;
1133 }
1134
1135 /* If the current token is an number in the half-open range [MIN,MAX), does
1136    nothing and returns true.  Otherwise, reports an error and returns false.
1137    If NAME is nonnull, then it is used in the error message. */
1138 bool
1139 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1140                               double min, double max)
1141 {
1142   bool is_number = lex_is_number (lexer);
1143   bool too_small = is_number && lex_number (lexer) < min;
1144   bool too_big = is_number && lex_number (lexer) >= max;
1145   if (is_number && !too_small && !too_big)
1146     return true;
1147
1148   if (min >= max)
1149     {
1150       /* Weird, maybe a bug in the caller.  Just report that we needed an
1151          number. */
1152       if (name)
1153         lex_error (lexer, _("Syntax error expecting number for %s."), name);
1154       else
1155         lex_error (lexer, _("Syntax error expecting number."));
1156     }
1157   else
1158     {
1159       bool report_lower_bound = min > -DBL_MAX || too_small;
1160       bool report_upper_bound = max < DBL_MAX || too_big;
1161
1162       if (report_lower_bound && report_upper_bound)
1163         {
1164           if (name)
1165             lex_error (lexer, _("Syntax error expecting number "
1166                                 "in [%g,%g) for %s."),
1167                        min, max, name);
1168           else
1169             lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
1170                        min, max);
1171         }
1172       else if (report_lower_bound)
1173         {
1174           if (min == 0)
1175             {
1176               if (name)
1177                 lex_error (lexer, _("Syntax error expecting "
1178                                     "non-negative number for %s."),
1179                            name);
1180               else
1181                 lex_error (lexer, _("Syntax error expecting "
1182                                     "non-negative number."));
1183             }
1184           else
1185             {
1186               if (name)
1187                 lex_error (lexer, _("Syntax error expecting "
1188                                     "number %g or greater for %s."),
1189                            min, name);
1190               else
1191                 lex_error (lexer, _("Syntax error expecting "
1192                                     "number %g or greater."), min);
1193             }
1194         }
1195       else if (report_upper_bound)
1196         {
1197           if (name)
1198             lex_error (lexer,
1199                        _("Syntax error expecting "
1200                          "number less than %g for %s."), max, name);
1201           else
1202             lex_error (lexer, _("Syntax error expecting "
1203                                 "number less than %g."), max);
1204         }
1205       else
1206         {
1207           if (name)
1208             lex_error (lexer, _("Syntax error expecting number for %s."), name);
1209           else
1210             lex_error (lexer, _("Syntax error expecting number."));
1211         }
1212     }
1213   return false;
1214 }
1215
1216 /* If the current token is an number in the open range (MIN,MAX), does
1217    nothing and returns true.  Otherwise, reports an error and returns false.
1218    If NAME is nonnull, then it is used in the error message. */
1219 bool
1220 lex_force_num_range_open (struct lexer *lexer, const char *name,
1221                           double min, double max)
1222 {
1223   bool is_number = lex_is_number (lexer);
1224   bool too_small = is_number && lex_number (lexer) <= min;
1225   bool too_big = is_number && lex_number (lexer) >= max;
1226   if (is_number && !too_small && !too_big)
1227     return true;
1228
1229   if (min >= max)
1230     {
1231       /* Weird, maybe a bug in the caller.  Just report that we needed an
1232          number. */
1233       if (name)
1234         lex_error (lexer, _("Syntax error expecting number for %s."), name);
1235       else
1236         lex_error (lexer, _("Syntax error expecting number."));
1237     }
1238   else
1239     {
1240       bool report_lower_bound = min > -DBL_MAX || too_small;
1241       bool report_upper_bound = max < DBL_MAX || too_big;
1242
1243       if (report_lower_bound && report_upper_bound)
1244         {
1245           if (name)
1246             lex_error (lexer, _("Syntax error expecting number "
1247                                 "in (%g,%g) for %s."),
1248                        min, max, name);
1249           else
1250             lex_error (lexer, _("Syntax error expecting number "
1251                                 "in (%g,%g)."), min, max);
1252         }
1253       else if (report_lower_bound)
1254         {
1255           if (min == 0)
1256             {
1257               if (name)
1258                 lex_error (lexer, _("Syntax error expecting "
1259                                     "positive number for %s."), name);
1260               else
1261                 lex_error (lexer, _("Syntax error expecting "
1262                                     "positive number."));
1263             }
1264           else
1265             {
1266               if (name)
1267                 lex_error (lexer, _("Syntax error expecting number "
1268                                     "greater than %g for %s."),
1269                            min, name);
1270               else
1271                 lex_error (lexer, _("Syntax error expecting number "
1272                                     "greater than %g."), min);
1273             }
1274         }
1275       else if (report_upper_bound)
1276         {
1277           if (name)
1278             lex_error (lexer, _("Syntax error expecting number "
1279                                 "less than %g for %s."),
1280                        max, name);
1281           else
1282             lex_error (lexer, _("Syntax error expecting number "
1283                                 "less than %g."), max);
1284         }
1285       else
1286         {
1287           if (name)
1288             lex_error (lexer, _("Syntax error expecting number "
1289                                 "for %s."), name);
1290           else
1291             lex_error (lexer, _("Syntax error expecting number."));
1292         }
1293     }
1294   return false;
1295 }
1296
1297 /* If the current token is an identifier, does nothing and returns true.
1298    Otherwise, reports an error and returns false. */
1299 bool
1300 lex_force_id (struct lexer *lexer)
1301 {
1302   if (lex_token (lexer) == T_ID)
1303     return true;
1304
1305   lex_error (lexer, _("Syntax error expecting identifier."));
1306   return false;
1307 }
1308 \f
1309 /* Token accessors. */
1310
1311 /* Returns the type of LEXER's current token. */
1312 enum token_type
1313 lex_token (const struct lexer *lexer)
1314 {
1315   return lex_next_token (lexer, 0);
1316 }
1317
1318 /* Returns the number in LEXER's current token.
1319
1320    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
1321    tokens this function will always return zero. */
1322 double
1323 lex_tokval (const struct lexer *lexer)
1324 {
1325   return lex_next_tokval (lexer, 0);
1326 }
1327
1328 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1329
1330    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1331    this functions this function will always return NULL.
1332
1333    The UTF-8 encoding of the returned string is correct for variable names and
1334    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1335    data_in() to use it in a "union value".  */
1336 const char *
1337 lex_tokcstr (const struct lexer *lexer)
1338 {
1339   return lex_next_tokcstr (lexer, 0);
1340 }
1341
1342 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1343    null-terminated (but the null terminator is not included in the returned
1344    substring's 'length').
1345
1346    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1347    this functions this function will always return NULL.
1348
1349    The UTF-8 encoding of the returned string is correct for variable names and
1350    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1351    data_in() to use it in a "union value".  */
1352 struct substring
1353 lex_tokss (const struct lexer *lexer)
1354 {
1355   return lex_next_tokss (lexer, 0);
1356 }
1357 \f
1358 /* Looking ahead.
1359
1360    A value of 0 for N as an argument to any of these functions refers to the
1361    current token.  Lookahead is limited to the current command.  Any N greater
1362    than the number of tokens remaining in the current command will be treated
1363    as referring to a T_ENDCMD token. */
1364
1365 static const struct lex_token *
1366 lex_next__ (const struct lexer *lexer_, int n)
1367 {
1368   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1369   struct lex_source *src = lex_source__ (lexer);
1370
1371   if (src != NULL)
1372     return lex_source_next__ (src, n);
1373   else
1374     {
1375       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1376       return &stop_token;
1377     }
1378 }
1379
1380 static const struct lex_token *
1381 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1382 {
1383   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1384
1385   if (ofs < 0)
1386     {
1387       static const struct lex_token endcmd_token
1388         = { .token = { .type = T_ENDCMD } };
1389       return &endcmd_token;
1390     }
1391
1392   while (ofs >= src->n_parse)
1393     {
1394       if (src->n_parse > 0)
1395         {
1396           const struct lex_token *t = src->parse[src->n_parse - 1];
1397           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1398             return t;
1399         }
1400
1401       lex_source_get_parse (src);
1402     }
1403
1404   return src->parse[ofs];
1405 }
1406
1407 static const struct lex_token *
1408 lex_source_next__ (const struct lex_source *src, int n)
1409 {
1410   return lex_source_ofs__ (src, n + src->parse_ofs);
1411 }
1412
1413 /* Returns the "struct token" of the token N after the current one in LEXER.
1414    The returned pointer can be invalidated by pretty much any succeeding call
1415    into the lexer, although the string pointer within the returned token is
1416    only invalidated by consuming the token (e.g. with lex_get()). */
1417 const struct token *
1418 lex_next (const struct lexer *lexer, int n)
1419 {
1420   return &lex_next__ (lexer, n)->token;
1421 }
1422
1423 /* Returns the type of the token N after the current one in LEXER. */
1424 enum token_type
1425 lex_next_token (const struct lexer *lexer, int n)
1426 {
1427   return lex_next (lexer, n)->type;
1428 }
1429
1430 /* Returns the number in the tokn N after the current one in LEXER.
1431
1432    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
1433    tokens this function will always return zero. */
1434 double
1435 lex_next_tokval (const struct lexer *lexer, int n)
1436 {
1437   return token_number (lex_next (lexer, n));
1438 }
1439
1440 /* Returns the null-terminated string in the token N after the current one, in
1441    UTF-8 encoding.
1442
1443    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1444    this functions this function will always return NULL.
1445
1446    The UTF-8 encoding of the returned string is correct for variable names and
1447    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1448    data_in() to use it in a "union value".  */
1449 const char *
1450 lex_next_tokcstr (const struct lexer *lexer, int n)
1451 {
1452   return lex_next_tokss (lexer, n).string;
1453 }
1454
1455 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1456    The string is null-terminated (but the null terminator is not included in
1457    the returned substring's 'length').
1458
1459    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1460    tokens this functions this function will always return NULL.
1461
1462    The UTF-8 encoding of the returned string is correct for variable names and
1463    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1464    data_in() to use it in a "union value".  */
1465 struct substring
1466 lex_next_tokss (const struct lexer *lexer, int n)
1467 {
1468   return lex_next (lexer, n)->string;
1469 }
1470
1471 /* Returns the offset of the current token within the command being parsed in
1472    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1473    on.  The return value is useful later for referring to this token in calls
1474    to lex_ofs_*(). */
1475 int
1476 lex_ofs (const struct lexer *lexer)
1477 {
1478   struct lex_source *src = lex_source__ (lexer);
1479   return src ? src->parse_ofs : 0;
1480 }
1481
1482 /* Returns the offset of the last token in the current command. */
1483 int
1484 lex_max_ofs (const struct lexer *lexer)
1485 {
1486   struct lex_source *src = lex_source__ (lexer);
1487   if (!src)
1488     return 0;
1489
1490   int ofs = MAX (1, src->n_parse) - 1;
1491   for (;;)
1492     {
1493       enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1494       if (type == T_ENDCMD || type == T_STOP)
1495         return ofs;
1496
1497       ofs++;
1498     }
1499 }
1500
1501 /* Returns the token within LEXER's current command with offset OFS.  Use
1502    lex_ofs() to find out the offset of the current token. */
1503 const struct token *
1504 lex_ofs_token (const struct lexer *lexer_, int ofs)
1505 {
1506   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1507   struct lex_source *src = lex_source__ (lexer);
1508
1509   if (src != NULL)
1510     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1511   else
1512     {
1513       static const struct token stop_token = { .type = T_STOP };
1514       return &stop_token;
1515     }
1516 }
1517
1518 /* Allocates and returns a new struct msg_location that spans tokens with
1519    offsets OFS0 through OFS1, inclusive, within the current command in
1520    LEXER.  See lex_ofs() for an explanation of token offsets.
1521
1522    The caller owns and must eventually free the returned object. */
1523 struct msg_location *
1524 lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
1525 {
1526   int ofs = lex_ofs (lexer);
1527   return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs);
1528 }
1529
1530 /* Returns a msg_point for the first character in the token with offset OFS,
1531    where offset 0 is the first token in the command currently being parsed, 1
1532    the second token, and so on.  These are absolute offsets, not relative to
1533    the token currently being parsed within the command.
1534
1535    Returns zeros for a T_STOP token.
1536  */
1537 struct msg_point
1538 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1539 {
1540   const struct lex_source *src = lex_source__ (lexer);
1541   return (src
1542           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1543           : (struct msg_point) { 0, 0 });
1544 }
1545
1546 /* Returns a msg_point for the last character, inclusive, in the token with
1547    offset OFS, where offset 0 is the first token in the command currently being
1548    parsed, 1 the second token, and so on.  These are absolute offsets, not
1549    relative to the token currently being parsed within the command.
1550
1551    Returns zeros for a T_STOP token.
1552
1553    Most of the time, a single token is wholly within a single line of syntax,
1554    so that the start and end point for a given offset have the same line
1555    number.  There are two exceptions: a T_STRING token can be made up of
1556    multiple segments on adjacent lines connected with "+" punctuators, and a
1557    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1558    the next.
1559  */
1560 struct msg_point
1561 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1562 {
1563   const struct lex_source *src = lex_source__ (lexer);
1564   return (src
1565           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1566           : (struct msg_point) { 0, 0 });
1567 }
1568
1569 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1570    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1571    are both zero, this requests the syntax for the current token.)
1572
1573    The caller must eventually free the returned string (with free()).  The
1574    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1575    that, for example, it may include comments, spaces, and new-lines if it
1576    spans multiple tokens.  Macro expansion, however, has already been
1577    performed. */
1578 char *
1579 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1580 {
1581   const struct lex_source *src = lex_source__ (lexer);
1582   return (src
1583           ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1584           : xstrdup (""));
1585 }
1586
1587
1588 /* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
1589    inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
1590    syntax for the first token in the current command.)
1591
1592    The caller must eventually free the returned string (with free()).  The
1593    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1594    that, for example, it may include comments, spaces, and new-lines if it
1595    spans multiple tokens.  Macro expansion, however, has already been
1596    performed. */
1597 char *
1598 lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
1599 {
1600   const struct lex_source *src = lex_source__ (lexer);
1601   return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup ("");
1602 }
1603
1604 /* Returns true if the token N ahead of the current one was produced by macro
1605    expansion, false otherwise. */
1606 bool
1607 lex_next_is_from_macro (const struct lexer *lexer, int n)
1608 {
1609   return lex_next__ (lexer, n)->macro_rep != NULL;
1610 }
1611
1612 static bool
1613 lex_tokens_match (const struct token *actual, const struct token *expected)
1614 {
1615   if (actual->type != expected->type)
1616     return false;
1617
1618   switch (actual->type)
1619     {
1620     case T_POS_NUM:
1621     case T_NEG_NUM:
1622       return actual->number == expected->number;
1623
1624     case T_ID:
1625       return lex_id_match (expected->string, actual->string);
1626
1627     case T_STRING:
1628       return (actual->string.length == expected->string.length
1629               && !memcmp (actual->string.string, expected->string.string,
1630                           actual->string.length));
1631
1632     default:
1633       return true;
1634     }
1635 }
1636
1637 static size_t
1638 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s)
1639 {
1640   struct string_lexer slex;
1641   struct token token;
1642
1643   size_t i = 0;
1644   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1645   while (string_lexer_next (&slex, &token))
1646     {
1647       bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + i++), &token);
1648       token_uninit (&token);
1649       if (!match)
1650         return 0;
1651     }
1652   return i;
1653 }
1654
1655 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1656    returns true.  Otherwise, returns false.
1657
1658    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1659    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1660    first three letters. */
1661 bool
1662 lex_at_phrase (struct lexer *lexer, const char *s)
1663 {
1664   return lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s) > 0;
1665 }
1666
1667 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1668    skips it and returns true.  Otherwise, returns false.
1669
1670    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1671    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1672    first three letters. */
1673 bool
1674 lex_match_phrase (struct lexer *lexer, const char *s)
1675 {
1676   size_t n = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
1677   if (n > 0)
1678     lex_get_n (lexer, n);
1679   return n > 0;
1680 }
1681
1682 /* Returns the 1-based line number of the source text at the byte OFFSET in
1683    SRC. */
1684 static int
1685 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1686 {
1687   size_t lo = 0;
1688   size_t hi = src->n_lines;
1689   for (;;)
1690     {
1691       size_t mid = (lo + hi) / 2;
1692       if (mid + 1 >= src->n_lines)
1693         return src->n_lines;
1694       else if (offset >= src->lines[mid + 1])
1695         lo = mid;
1696       else if (offset < src->lines[mid])
1697         hi = mid;
1698       else
1699         return mid + 1;
1700     }
1701 }
1702
1703 /* Returns the 1-based column number of the source text at the byte OFFSET in
1704    SRC. */
1705 static int
1706 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1707 {
1708   const char *newline = memrchr (src->buffer, '\n', offset);
1709   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1710   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1711 }
1712
1713 static struct msg_point
1714 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1715 {
1716   return (struct msg_point) {
1717     .line = lex_source_ofs_to_line_number (src, offset),
1718     .column = lex_source_ofs_to_column_number (src, offset),
1719   };
1720 }
1721
1722 static struct msg_point
1723 lex_token_start_point (const struct lex_source *src,
1724                        const struct lex_token *token)
1725 {
1726   return lex_source_ofs_to_point__ (src, token->token_pos);
1727 }
1728
1729 static struct msg_point
1730 lex_token_end_point (const struct lex_source *src,
1731                      const struct lex_token *token)
1732 {
1733   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1734 }
1735
1736 static struct msg_location
1737 lex_token_location (const struct lex_source *src,
1738                     const struct lex_token *t0,
1739                     const struct lex_token *t1)
1740 {
1741   return (struct msg_location) {
1742     .file_name = intern_new_if_nonnull (src->reader->file_name),
1743     .start = lex_token_start_point (src, t0),
1744     .end = lex_token_end_point (src, t1),
1745     .src = CONST_CAST (struct lex_source *, src),
1746   };
1747 }
1748
1749 static struct msg_location *
1750 lex_token_location_rw (const struct lex_source *src,
1751                        const struct lex_token *t0,
1752                        const struct lex_token *t1)
1753 {
1754   struct msg_location location = lex_token_location (src, t0, t1);
1755   return msg_location_dup (&location);
1756 }
1757
1758 static struct msg_location *
1759 lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
1760 {
1761   return lex_token_location_rw (src,
1762                                 lex_source_ofs__ (src, ofs0),
1763                                 lex_source_ofs__ (src, ofs1));
1764 }
1765
1766 /* Returns the name of the syntax file from which the current command is drawn.
1767    Returns NULL for a T_STOP token or if the command's source does not have
1768    line numbers.
1769
1770    There is no version of this function that takes an N argument because
1771    lookahead only works to the end of a command and any given command is always
1772    within a single syntax file. */
1773 const char *
1774 lex_get_file_name (const struct lexer *lexer)
1775 {
1776   struct lex_source *src = lex_source__ (lexer);
1777   return src == NULL ? NULL : src->reader->file_name;
1778 }
1779
1780 /* Returns a newly allocated msg_location for the syntax that represents tokens
1781    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1782    must eventually free the location (with msg_location_destroy()). */
1783 struct msg_location *
1784 lex_get_location (const struct lexer *lexer, int n0, int n1)
1785 {
1786   struct msg_location *loc = xmalloc (sizeof *loc);
1787   *loc = (struct msg_location) {
1788     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1789     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1790     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1791     .src = lex_source__ (lexer),
1792   };
1793   lex_source_ref (loc->src);
1794   return loc;
1795 }
1796
1797 const char *
1798 lex_get_encoding (const struct lexer *lexer)
1799 {
1800   struct lex_source *src = lex_source__ (lexer);
1801   return src == NULL ? NULL : src->reader->encoding;
1802 }
1803
1804 /* Returns the syntax mode for the syntax file from which the current drawn is
1805    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1806    does not have line numbers.
1807
1808    There is no version of this function that takes an N argument because
1809    lookahead only works to the end of a command and any given command is always
1810    within a single syntax file. */
1811 enum segmenter_mode
1812 lex_get_syntax_mode (const struct lexer *lexer)
1813 {
1814   struct lex_source *src = lex_source__ (lexer);
1815   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1816 }
1817
1818 /* Returns the error mode for the syntax file from which the current drawn is
1819    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1820    source does not have line numbers.
1821
1822    There is no version of this function that takes an N argument because
1823    lookahead only works to the end of a command and any given command is always
1824    within a single syntax file. */
1825 enum lex_error_mode
1826 lex_get_error_mode (const struct lexer *lexer)
1827 {
1828   struct lex_source *src = lex_source__ (lexer);
1829   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1830 }
1831
1832 /* If the source that LEXER is currently reading has error mode
1833    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1834    token to be read comes directly from whatever is next read from the stream.
1835
1836    It makes sense to call this function after encountering an error in a
1837    command entered on the console, because usually the user would prefer not to
1838    have cascading errors. */
1839 void
1840 lex_interactive_reset (struct lexer *lexer)
1841 {
1842   struct lex_source *src = lex_source__ (lexer);
1843   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1844     {
1845       src->length = 0;
1846       src->journal_pos = src->seg_pos = 0;
1847       src->n_lines = 0;
1848       src->suppress_next_newline = false;
1849       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1850                                        false);
1851       lex_stage_clear (&src->pp);
1852       lex_stage_clear (&src->merge);
1853       lex_source_clear_parse (src);
1854       lex_source_push_endcmd__ (src);
1855     }
1856 }
1857
1858 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1859 void
1860 lex_discard_rest_of_command (struct lexer *lexer)
1861 {
1862   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1863     lex_get (lexer);
1864 }
1865
1866 /* Discards all lookahead tokens in LEXER, then discards all input sources
1867    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1868    runs out of input sources. */
1869 void
1870 lex_discard_noninteractive (struct lexer *lexer)
1871 {
1872   struct lex_source *src = lex_source__ (lexer);
1873   if (src != NULL)
1874     {
1875       if (src->reader->error == LEX_ERROR_IGNORE)
1876         return;
1877
1878       lex_stage_clear (&src->pp);
1879       lex_stage_clear (&src->merge);
1880       lex_source_clear_parse (src);
1881
1882       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1883            src = lex_source__ (lexer))
1884         {
1885           ll_remove (&src->ll);
1886           lex_source_unref (src);
1887         }
1888     }
1889 }
1890 \f
1891 static void
1892 lex_source_expand__ (struct lex_source *src)
1893 {
1894   if (src->length >= src->allocated)
1895     src->buffer = x2realloc (src->buffer, &src->allocated);
1896 }
1897
1898 static void
1899 lex_source_read__ (struct lex_source *src)
1900 {
1901   do
1902     {
1903       lex_source_expand__ (src);
1904
1905       size_t space = src->allocated - src->length;
1906       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1907       size_t n = src->reader->class->read (src->reader,
1908                                            &src->buffer[src->length],
1909                                            space, prompt);
1910       assert (n <= space);
1911
1912       if (n == 0)
1913         {
1914           /* End of input. */
1915           src->reader->eof = true;
1916           return;
1917         }
1918
1919       src->length += n;
1920     }
1921   while (!memchr (&src->buffer[src->seg_pos], '\n',
1922                   src->length - src->seg_pos));
1923 }
1924
1925 static struct lex_source *
1926 lex_source__ (const struct lexer *lexer)
1927 {
1928   return (ll_is_empty (&lexer->sources) ? NULL
1929           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1930 }
1931
1932 const struct lex_source *
1933 lex_source (const struct lexer *lexer)
1934 {
1935   return lex_source__ (lexer);
1936 }
1937
1938 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1939    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
1940    both zero, this requests the syntax for the first token in the current
1941    command.)  The caller must eventually free the returned string (with
1942    free()).  The syntax is encoded in UTF-8 and in the original form supplied
1943    to the lexer so that, for example, it may include comments, spaces, and
1944    new-lines if it spans multiple tokens.  Macro expansion, however, has
1945    already been performed. */
1946 static char *
1947 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1948 {
1949   struct string s = DS_EMPTY_INITIALIZER;
1950   for (size_t i = ofs0; i <= ofs1; )
1951     {
1952       /* Find [I,J) as the longest sequence of tokens not produced by macro
1953          expansion, or otherwise the longest sequence expanded from a single
1954          macro call. */
1955       const struct lex_token *first = lex_source_ofs__ (src, i);
1956       size_t j;
1957       for (j = i + 1; j <= ofs1; j++)
1958         {
1959           const struct lex_token *cur = lex_source_ofs__ (src, j);
1960           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1961               || first->macro_rep != cur->macro_rep)
1962             break;
1963         }
1964       const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1965
1966       /* Now add the syntax for this sequence of tokens to SRC. */
1967       if (!ds_is_empty (&s))
1968         ds_put_byte (&s, ' ');
1969       if (!first->macro_rep)
1970         {
1971           size_t start = first->token_pos;
1972           size_t end = last->token_pos + last->token_len;
1973           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1974         }
1975       else
1976         {
1977           size_t start = first->ofs;
1978           size_t end = last->ofs + last->len;
1979           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1980                                            end - start));
1981         }
1982
1983       i = j;
1984     }
1985   return ds_steal_cstr (&s);
1986 }
1987
1988 static bool
1989 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1990 {
1991   for (int i = ofs0; i <= ofs1; i++)
1992     if (lex_source_ofs__ (src, i)->macro_rep)
1993       return true;
1994   return false;
1995 }
1996
1997 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1998    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1999    other tokens included in that range.  The syntax is encoded in UTF-8 and in
2000    the original form supplied to the lexer so that, for example, it may include
2001    comments, spaces, and new-lines if it spans multiple tokens.
2002
2003    Returns an empty string if the token range doesn't include a macro call.
2004
2005    The caller must not modify or free the returned string. */
2006 static struct substring
2007 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2008 {
2009   if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2010     return ss_empty ();
2011
2012   const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2013   const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2014   size_t start = token0->token_pos;
2015   size_t end = token1->token_pos + token1->token_len;
2016
2017   return ss_buffer (&src->buffer[start], end - start);
2018 }
2019
2020 static void
2021 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2022                        int ofs0, int ofs1, const char *format, va_list args)
2023 {
2024   struct string s = DS_EMPTY_INITIALIZER;
2025
2026   if (src)
2027     {
2028       /* Get the macro call(s) that expanded to the syntax that caused the
2029          error. */
2030       char call[64];
2031       str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2032                      call, sizeof call);
2033       if (call[0])
2034         ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2035     }
2036   else
2037     ds_put_cstr (&s, _("At end of input"));
2038
2039   if (!ds_is_empty (&s))
2040     ds_put_cstr (&s, ": ");
2041   if (format)
2042     ds_put_vformat (&s, format, args);
2043   else
2044     ds_put_cstr (&s, _("Syntax error."));
2045
2046   if (ds_last (&s) != '.')
2047     ds_put_byte (&s, '.');
2048
2049   struct msg *m = xmalloc (sizeof *m);
2050   *m = (struct msg) {
2051     .category = msg_class_to_category (class),
2052     .severity = msg_class_to_severity (class),
2053     .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2054     .text = ds_steal_cstr (&s),
2055   };
2056   msg_emit (m);
2057 }
2058
2059 static void
2060 lex_get_error (struct lex_source *src, const struct lex_token *token)
2061 {
2062   char syntax[64];
2063   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2064                  syntax, sizeof syntax);
2065
2066   struct string s = DS_EMPTY_INITIALIZER;
2067   ds_put_cstr (&s, token->token.string.string);
2068
2069   struct msg *m = xmalloc (sizeof *m);
2070   *m = (struct msg) {
2071     .category = MSG_C_SYNTAX,
2072     .severity = MSG_S_ERROR,
2073     .location = lex_token_location_rw (src, token, token),
2074     .text = ds_steal_cstr (&s),
2075   };
2076   msg_emit (m);
2077 }
2078
2079 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2080    underlying lex_reader if necessary.  Returns true if a new token was added
2081    to SRC's deque, false otherwise.  The caller should retry failures unless
2082    SRC's 'eof' marker was set to true indicating that there will be no more
2083    tokens from this source. */
2084 static bool
2085 lex_source_try_get_pp (struct lex_source *src)
2086 {
2087   /* Append a new token to SRC and initialize it. */
2088   struct lex_token *token = xmalloc (sizeof *token);
2089   token->token = (struct token) { .type = T_STOP };
2090   token->macro_rep = NULL;
2091   token->ref_cnt = NULL;
2092   token->token_pos = src->seg_pos;
2093
2094   /* Extract a segment. */
2095   const char *segment;
2096   enum segment_type seg_type;
2097   int seg_len;
2098   for (;;)
2099     {
2100       segment = &src->buffer[src->seg_pos];
2101       seg_len = segmenter_push (&src->segmenter, segment,
2102                                 src->length - src->seg_pos,
2103                                 src->reader->eof, &seg_type);
2104       if (seg_len >= 0)
2105         break;
2106
2107       /* The segmenter needs more input to produce a segment. */
2108       assert (!src->reader->eof);
2109       lex_source_read__ (src);
2110     }
2111
2112   /* Update state based on the segment. */
2113   token->token_len = seg_len;
2114   src->seg_pos += seg_len;
2115   if (seg_type == SEG_NEWLINE)
2116     {
2117       if (src->n_lines >= src->allocated_lines)
2118         src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2119                                  sizeof *src->lines);
2120       src->lines[src->n_lines++] = src->seg_pos;
2121     }
2122
2123   /* Get a token from the segment. */
2124   enum tokenize_result result = token_from_segment (
2125     seg_type, ss_buffer (segment, seg_len), &token->token);
2126
2127   /* If we've reached the end of a line, or the end of a command, then pass
2128      the line to the output engine as a syntax text item.  */
2129   int n_lines = seg_type == SEG_NEWLINE;
2130   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2131     {
2132       n_lines++;
2133       src->suppress_next_newline = true;
2134     }
2135   else if (n_lines > 0 && src->suppress_next_newline)
2136     {
2137       n_lines--;
2138       src->suppress_next_newline = false;
2139     }
2140   for (int i = 0; i < n_lines; i++)
2141     {
2142       /* Beginning of line. */
2143       const char *line = &src->buffer[src->journal_pos];
2144
2145       /* Calculate line length, including \n or \r\n end-of-line if present.
2146
2147          We use src->length even though that may be beyond what we've actually
2148          converted to tokens.  That's because, if we're emitting the line due
2149          to SEG_END_COMMAND, we want to take the whole line through the
2150          newline, not just through the '.'. */
2151       size_t max_len = src->length - src->journal_pos;
2152       const char *newline = memchr (line, '\n', max_len);
2153       size_t line_len = newline ? newline - line + 1 : max_len;
2154
2155       /* Calculate line length excluding end-of-line. */
2156       size_t copy_len = line_len;
2157       if (copy_len > 0 && line[copy_len - 1] == '\n')
2158         copy_len--;
2159       if (copy_len > 0 && line[copy_len - 1] == '\r')
2160         copy_len--;
2161
2162       /* Submit the line as syntax. */
2163       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2164                                                    xmemdup0 (line, copy_len),
2165                                                    NULL));
2166
2167       src->journal_pos += line_len;
2168     }
2169
2170   switch (result)
2171     {
2172     case TOKENIZE_ERROR:
2173       lex_get_error (src, token);
2174       /* Fall through. */
2175     case TOKENIZE_EMPTY:
2176       lex_token_destroy (token);
2177       return false;
2178
2179     case TOKENIZE_TOKEN:
2180       if (token->token.type == T_STOP)
2181         {
2182           token->token.type = T_ENDCMD;
2183           src->eof = true;
2184         }
2185       lex_stage_push_last (&src->pp, token);
2186       return true;
2187     }
2188   NOT_REACHED ();
2189 }
2190
2191 /* Attempts to append a new token to SRC.  Returns true if successful, false on
2192    failure.  On failure, the end of SRC has been reached and no more tokens
2193    will be forthcoming from it.
2194
2195    Does not make the new token available for lookahead yet; the caller must
2196    adjust SRC's 'middle' pointer to do so. */
2197 static bool
2198 lex_source_get_pp (struct lex_source *src)
2199 {
2200   while (!src->eof)
2201     if (lex_source_try_get_pp (src))
2202       return true;
2203   return false;
2204 }
2205
2206 static bool
2207 lex_source_try_get_merge (const struct lex_source *src_)
2208 {
2209   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2210
2211   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2212     return false;
2213
2214   if (!settings_get_mexpand ())
2215     {
2216       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2217       return true;
2218     }
2219
2220   /* Now pass tokens one-by-one to the macro expander.
2221
2222      In the common case where there is no macro to expand, the loop is not
2223      entered.  */
2224   struct macro_call *mc;
2225   int n_call = macro_call_create (src->lexer->macros,
2226                                   &lex_stage_first (&src->pp)->token, &mc);
2227   for (int ofs = 1; !n_call; ofs++)
2228     {
2229       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2230         {
2231           /* This should not be reachable because we always get a T_ENDCMD at
2232              the end of an input file (transformed from T_STOP by
2233              lex_source_try_get_pp()) and the macro_expander should always
2234              terminate expansion on T_ENDCMD. */
2235           NOT_REACHED ();
2236         }
2237
2238       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2239       const struct macro_token mt = {
2240         .token = t->token,
2241         .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2242       };
2243       const struct msg_location loc = lex_token_location (src, t, t);
2244       n_call = macro_call_add (mc, &mt, &loc);
2245     }
2246   if (n_call < 0)
2247     {
2248       /* False alarm: no macro expansion after all.  Use first token as
2249          lookahead.  We'll retry macro expansion from the second token next
2250          time around. */
2251       macro_call_destroy (mc);
2252       lex_stage_shift (&src->merge, &src->pp, 1);
2253       return true;
2254     }
2255
2256   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2257      are a macro call.  (These are likely to be the only tokens in 'pp'.)
2258      Expand them.  */
2259   const struct lex_token *c0 = lex_stage_first (&src->pp);
2260   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2261   struct macro_tokens expansion = { .n = 0 };
2262   struct msg_location loc = lex_token_location (src, c0, c1);
2263   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2264   macro_call_destroy (mc);
2265
2266   /* Convert the macro expansion into syntax for possible error messages
2267      later. */
2268   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2269   size_t *len = xnmalloc (expansion.n, sizeof *len);
2270   struct string s = DS_EMPTY_INITIALIZER;
2271   macro_tokens_to_syntax (&expansion, &s, ofs, len);
2272
2273   if (settings_get_mprint ())
2274     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2275                                           _("Macro Expansion")));
2276
2277   /* Append the macro expansion tokens to the lookahead. */
2278   if (expansion.n > 0)
2279     {
2280       char *macro_rep = ds_steal_cstr (&s);
2281       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2282       *ref_cnt = expansion.n;
2283       for (size_t i = 0; i < expansion.n; i++)
2284         {
2285           struct lex_token *token = xmalloc (sizeof *token);
2286           *token = (struct lex_token) {
2287             .token = expansion.mts[i].token,
2288             .token_pos = c0->token_pos,
2289             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2290             .macro_rep = macro_rep,
2291             .ofs = ofs[i],
2292             .len = len[i],
2293             .ref_cnt = ref_cnt,
2294           };
2295           lex_stage_push_last (&src->merge, token);
2296
2297           ss_dealloc (&expansion.mts[i].syntax);
2298         }
2299     }
2300   else
2301     ds_destroy (&s);
2302   free (expansion.mts);
2303   free (ofs);
2304   free (len);
2305
2306   /* Destroy the tokens for the call. */
2307   for (size_t i = 0; i < n_call; i++)
2308     lex_stage_pop_first (&src->pp);
2309
2310   return expansion.n > 0;
2311 }
2312
2313 /* Attempts to obtain at least one new token into 'merge' in SRC.
2314
2315    Returns true if successful, false on failure.  In the latter case, SRC is
2316    exhausted and 'src->eof' is now true. */
2317 static bool
2318 lex_source_get_merge (struct lex_source *src)
2319 {
2320   while (!src->eof)
2321     if (lex_source_try_get_merge (src))
2322       return true;
2323   return false;
2324 }
2325
2326 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2327
2328    Returns true if successful, false on failure.  In the latter case, SRC is
2329    exhausted and 'src->eof' is now true. */
2330 static bool
2331 lex_source_get_parse (struct lex_source *src)
2332 {
2333   struct merger m = MERGER_INIT;
2334   struct token out;
2335   for (size_t i = 0; ; i++)
2336     {
2337       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2338         {
2339           /* We always get a T_ENDCMD at the end of an input file
2340              (transformed from T_STOP by lex_source_try_get_pp()) and
2341              merger_add() should never return -1 on T_ENDCMD. */
2342           assert (lex_stage_is_empty (&src->merge));
2343           return false;
2344         }
2345
2346       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2347                                &out);
2348       if (!retval)
2349         {
2350           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2351           return true;
2352         }
2353       else if (retval > 0)
2354         {
2355           /* Add a token that merges all the tokens together. */
2356           const struct lex_token *first = lex_stage_first (&src->merge);
2357           const struct lex_token *last = lex_stage_nth (&src->merge,
2358                                                         retval - 1);
2359           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2360           struct lex_token *t = xmalloc (sizeof *t);
2361           *t = (struct lex_token) {
2362             .token = out,
2363             .token_pos = first->token_pos,
2364             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2365
2366             /* This works well if all the tokens were not expanded from macros,
2367                or if they came from the same macro expansion.  It just gives up
2368                in the other (corner) cases. */
2369             .macro_rep = macro ? first->macro_rep : NULL,
2370             .ofs = macro ? first->ofs : 0,
2371             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2372             .ref_cnt = macro ? first->ref_cnt : NULL,
2373           };
2374           if (t->ref_cnt)
2375             ++*t->ref_cnt;
2376           lex_source_push_parse (src, t);
2377
2378           for (int i = 0; i < retval; i++)
2379             lex_stage_pop_first (&src->merge);
2380           return true;
2381         }
2382     }
2383 }
2384 \f
2385 static void
2386 lex_source_push_endcmd__ (struct lex_source *src)
2387 {
2388   assert (src->n_parse == 0);
2389
2390   struct lex_token *token = xmalloc (sizeof *token);
2391   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2392   lex_source_push_parse (src, token);
2393 }
2394
2395 static void
2396 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2397 {
2398   if (src->n_parse >= src->allocated_parse)
2399     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2400                              sizeof *src->parse);
2401   src->parse[src->n_parse++] = token;
2402 }
2403
2404 static void
2405 lex_source_clear_parse (struct lex_source *src)
2406 {
2407   for (size_t i = 0; i < src->n_parse; i++)
2408     lex_token_destroy (src->parse[i]);
2409   src->n_parse = src->parse_ofs = 0;
2410 }
2411
2412 static struct lex_source *
2413 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2414 {
2415   size_t allocated_lines = 4;
2416   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2417   *lines = 0;
2418
2419   struct lex_source *src = xmalloc (sizeof *src);
2420   *src = (struct lex_source) {
2421     .n_refs = 1,
2422     .reader = reader,
2423     .segmenter = segmenter_init (reader->syntax, false),
2424     .lexer = lexer,
2425     .lines = lines,
2426     .n_lines = 1,
2427     .allocated_lines = allocated_lines,
2428   };
2429
2430   lex_source_push_endcmd__ (src);
2431
2432   return src;
2433 }
2434
2435 void
2436 lex_set_message_handler (struct lexer *lexer,
2437                          void (*output_msg) (const struct msg *,
2438                                              struct lexer *))
2439 {
2440   struct msg_handler msg_handler = {
2441     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2442     .aux = lexer,
2443     .lex_source_ref = lex_source_ref,
2444     .lex_source_unref = lex_source_unref,
2445     .lex_source_get_line = lex_source_get_line,
2446   };
2447   msg_set_handler (&msg_handler);
2448 }
2449
2450 struct lex_source *
2451 lex_source_ref (const struct lex_source *src_)
2452 {
2453   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2454   if (src)
2455     {
2456       assert (src->n_refs > 0);
2457       src->n_refs++;
2458     }
2459   return src;
2460 }
2461
2462 void
2463 lex_source_unref (struct lex_source *src)
2464 {
2465   if (!src)
2466     return;
2467
2468   assert (src->n_refs > 0);
2469   if (--src->n_refs > 0)
2470     return;
2471
2472   char *file_name = src->reader->file_name;
2473   char *encoding = src->reader->encoding;
2474   if (src->reader->class->destroy != NULL)
2475     src->reader->class->destroy (src->reader);
2476   free (file_name);
2477   free (encoding);
2478   free (src->buffer);
2479   free (src->lines);
2480   lex_stage_uninit (&src->pp);
2481   lex_stage_uninit (&src->merge);
2482   lex_source_clear_parse (src);
2483   free (src->parse);
2484   free (src);
2485 }
2486 \f
2487 struct lex_file_reader
2488   {
2489     struct lex_reader reader;
2490     struct u8_istream *istream;
2491   };
2492
2493 static struct lex_reader_class lex_file_reader_class;
2494
2495 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2496    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2497    ENCODING, which should take one of the forms accepted by
2498    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2499    mode of the new reader, respectively.
2500
2501    Returns a null pointer if FILE_NAME cannot be opened. */
2502 struct lex_reader *
2503 lex_reader_for_file (const char *file_name, const char *encoding,
2504                      enum segmenter_mode syntax,
2505                      enum lex_error_mode error)
2506 {
2507   struct lex_file_reader *r;
2508   struct u8_istream *istream;
2509
2510   istream = (!strcmp(file_name, "-")
2511              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2512              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2513   if (istream == NULL)
2514     {
2515       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2516       return NULL;
2517     }
2518
2519   r = xmalloc (sizeof *r);
2520   lex_reader_init (&r->reader, &lex_file_reader_class);
2521   r->reader.syntax = syntax;
2522   r->reader.error = error;
2523   r->reader.file_name = xstrdup (file_name);
2524   r->reader.encoding = xstrdup_if_nonnull (encoding);
2525   r->reader.line_number = 1;
2526   r->istream = istream;
2527
2528   return &r->reader;
2529 }
2530
2531 static struct lex_file_reader *
2532 lex_file_reader_cast (struct lex_reader *r)
2533 {
2534   return UP_CAST (r, struct lex_file_reader, reader);
2535 }
2536
2537 static size_t
2538 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2539                enum prompt_style prompt_style UNUSED)
2540 {
2541   struct lex_file_reader *r = lex_file_reader_cast (r_);
2542   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2543   if (n_read < 0)
2544     {
2545       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2546       return 0;
2547     }
2548   return n_read;
2549 }
2550
2551 static void
2552 lex_file_close (struct lex_reader *r_)
2553 {
2554   struct lex_file_reader *r = lex_file_reader_cast (r_);
2555
2556   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2557     {
2558       if (u8_istream_close (r->istream) != 0)
2559         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2560     }
2561   else
2562     u8_istream_free (r->istream);
2563
2564   free (r);
2565 }
2566
2567 static struct lex_reader_class lex_file_reader_class =
2568   {
2569     lex_file_read,
2570     lex_file_close
2571   };
2572 \f
2573 struct lex_string_reader
2574   {
2575     struct lex_reader reader;
2576     struct substring s;
2577     size_t offset;
2578   };
2579
2580 static struct lex_reader_class lex_string_reader_class;
2581
2582 /* Creates and returns a new lex_reader for the contents of S, which must be
2583    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2584    with ss_dealloc() when it is closed. */
2585 struct lex_reader *
2586 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2587 {
2588   struct lex_string_reader *r;
2589
2590   r = xmalloc (sizeof *r);
2591   lex_reader_init (&r->reader, &lex_string_reader_class);
2592   r->reader.syntax = SEG_MODE_AUTO;
2593   r->reader.encoding = xstrdup_if_nonnull (encoding);
2594   r->s = s;
2595   r->offset = 0;
2596
2597   return &r->reader;
2598 }
2599
2600 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2601    which must be encoded in ENCODING.  The caller retains ownership of S. */
2602 struct lex_reader *
2603 lex_reader_for_string (const char *s, const char *encoding)
2604 {
2605   return lex_reader_for_substring_nocopy (ss_clone (ss_cstr (s)), encoding);
2606 }
2607
2608 /* Formats FORMAT as a printf()-like format string and creates and returns a
2609    new lex_reader for the formatted result.  */
2610 struct lex_reader *
2611 lex_reader_for_format (const char *format, const char *encoding, ...)
2612 {
2613   struct lex_reader *r;
2614   va_list args;
2615
2616   va_start (args, encoding);
2617   r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
2618   va_end (args);
2619
2620   return r;
2621 }
2622
2623 static struct lex_string_reader *
2624 lex_string_reader_cast (struct lex_reader *r)
2625 {
2626   return UP_CAST (r, struct lex_string_reader, reader);
2627 }
2628
2629 static size_t
2630 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2631                  enum prompt_style prompt_style UNUSED)
2632 {
2633   struct lex_string_reader *r = lex_string_reader_cast (r_);
2634   size_t chunk;
2635
2636   chunk = MIN (n, r->s.length - r->offset);
2637   memcpy (buf, r->s.string + r->offset, chunk);
2638   r->offset += chunk;
2639
2640   return chunk;
2641 }
2642
2643 static void
2644 lex_string_close (struct lex_reader *r_)
2645 {
2646   struct lex_string_reader *r = lex_string_reader_cast (r_);
2647
2648   ss_dealloc (&r->s);
2649   free (r);
2650 }
2651
2652 static struct lex_reader_class lex_string_reader_class =
2653   {
2654     lex_string_read,
2655     lex_string_close
2656   };
2657 \f
2658 struct substring
2659 lex_source_get_line (const struct lex_source *src, int line)
2660 {
2661   if (line < 1 || line > src->n_lines)
2662     return ss_empty ();
2663
2664   size_t ofs = src->lines[line - 1];
2665   size_t end;
2666   if (line < src->n_lines)
2667     end = src->lines[line];
2668   else
2669     {
2670       const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2671       end = newline ? newline - src->buffer : src->length;
2672     }
2673   return ss_buffer (&src->buffer[ofs], end - ofs);
2674 }