pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call. */
  70     size_t token_pos;           /* Offset into src->buffer of token start. */
  71     size_t token_len;           /* Length of source for token in bytes. */
  72
  73     /* For a token obtained through macro expansion, this is just this token.
  74
  75        For a token obtained through the lexer in an ordinary way, these are
  76        nulls and zeros. */
  77     char *macro_rep;        /* The whole macro expansion. */
  78     size_t ofs;             /* Offset of this token in macro_rep. */
  79     size_t len;             /* Length of this token in macro_rep. */
  80     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
         /* macro_rep and ref_cnt are shared between tokens: lex_token_destroy()
            decrements *ref_cnt and frees both once the count reaches zero. */
  81   };
  82
  /* Forward declarations of file-local helper functions. */
  83 static struct msg_point lex_token_start_point (const struct lex_source *,
  84                                                const struct lex_token *);
  85 static struct msg_point lex_token_end_point (const struct lex_source *,
  86                                              const struct lex_token *);
  87
  88 static size_t lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s);
  89
  90 /* Source offset of the last byte in TOKEN. */
  91 static size_t
  92 lex_token_end (const struct lex_token *token)
  93 {
  94   return token->token_pos + MAX (token->token_len, 1) - 1;
  95 }
  96
  97 static void
  98 lex_token_destroy (struct lex_token *t)
  99 {
 100   token_uninit (&t->token);
 101   if (t->ref_cnt)
 102     {
 103       assert (*t->ref_cnt > 0);
 104       if (!--*t->ref_cnt)
 105         {
 106           free (t->macro_rep);
 107           free (t->ref_cnt);
 108         }
 109     }
 110   free (t);
 111 }
 112 \f
 113 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 114    lex_source. */
 115 struct lex_stage
 116   {
 117     struct deque deque;         /* Bookkeeping; yields indexes into 'tokens'. */
 118     struct lex_token **tokens;  /* Token pointers, indexed via 'deque'. */
 119   };
 120
 /* Forward declarations of the lex_stage helpers defined below. */
 121 static void lex_stage_clear (struct lex_stage *);
 122 static void lex_stage_uninit (struct lex_stage *);
 123
 124 static size_t lex_stage_count (const struct lex_stage *);
 125 static bool lex_stage_is_empty (const struct lex_stage *);
 126
 127 static struct lex_token *lex_stage_first (struct lex_stage *);
 128 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 129
 130 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 131 static void lex_stage_pop_first (struct lex_stage *);
 132
 133 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 134                              size_t n);
 135
 136 /* Deletes all the tokens from STAGE. */
 137 static void
 138 lex_stage_clear (struct lex_stage *stage)
 139 {
 140   while (!deque_is_empty (&stage->deque))
 141     lex_stage_pop_first (stage);
 142 }
 143
 144 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 145 static void
 146 lex_stage_uninit (struct lex_stage *stage)
 147 {
 148   lex_stage_clear (stage);
 149   free (stage->tokens);
 150 }
 151
 152 /* Returns true if STAGE contains no tokens, otherwise false. */
 153 static bool
 154 lex_stage_is_empty (const struct lex_stage *stage)
 155 {
 156   return deque_is_empty (&stage->deque);
 157 }
 158
 159 /* Returns the number of tokens in STAGE. */
 160 static size_t
 161 lex_stage_count (const struct lex_stage *stage)
 162 {
 163   return deque_count (&stage->deque);
 164 }
 165
 /* Returns the first token in STAGE, that is, the one reached with the least
    lookahead.  STAGE must not be empty. */
 static struct lex_token *
 lex_stage_first (struct lex_stage *stage)
 {
   const size_t least_lookahead = 0;
   return lex_stage_nth (stage, least_lookahead);
 }
 173
 174 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 175    lookahead) is 0, the second token is 1, and so on.  There must be at least
 176    INDEX + 1 tokens in STAGE. */
 177 static struct lex_token *
 178 lex_stage_nth (struct lex_stage *stage, size_t index)
 179 {
 180   return stage->tokens[deque_back (&stage->deque, index)];
 181 }
 182
 183 /* Adds TOKEN so that it becomes the last token in STAGE. */
 184 static void
 185 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 186 {
 187   if (deque_is_full (&stage->deque))
 188     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 189                                   sizeof *stage->tokens);
 190   stage->tokens[deque_push_front (&stage->deque)] = token;
 191 }
 192
 193 /* Removes and returns the first token from STAGE. */
 194 static struct lex_token *
 195 lex_stage_take_first (struct lex_stage *stage)
 196 {
 197   return stage->tokens[deque_pop_back (&stage->deque)];
 198 }
 199
 /* Detaches the first token from STAGE and destroys it. */
 static void
 lex_stage_pop_first (struct lex_stage *stage)
 {
   struct lex_token *first = lex_stage_take_first (stage);
   lex_token_destroy (first);
 }
 206
 /* Moves the first N tokens of SRC, in order, to the end of DST. */
 static void
 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 {
   while (n-- > 0)
     {
       struct lex_token *moved = lex_stage_take_first (src);
       lex_stage_push_last (dst, moved);
     }
 }
 215
 216 /* A source of tokens, corresponding to a syntax file.
 217
 218    This is conceptually a lex_reader wrapped with everything needed to convert
 219    its UTF-8 bytes into tokens. */
 220 struct lex_source
 221   {
 222     struct ll ll;               /* In lexer's list of sources. */
 223
 224     /* Reference count:
 225
 226        - One for struct lexer.
 227
 228        - One for each struct msg_location that references this source. */
 229     size_t n_refs;
 230
 231     struct lex_reader *reader;
 232     struct lexer *lexer;
 233     struct segmenter segmenter;
 234     bool eof;                   /* True if T_STOP was read from 'reader'. */
 235
 236     /* Buffer of UTF-8 bytes. */
 237     char *buffer;               /* Source file contents. */
 238     size_t length;              /* Number of bytes filled. */
 239     size_t allocated;           /* Number of bytes allocated. */
 240
 241     /* Offsets into 'buffer'. */
 242     size_t journal_pos;         /* First byte not yet output to journal. */
 243     size_t seg_pos;             /* First byte not yet scanned as token. */
 244
 245     /* Offset into 'buffer' of starts of lines. */
 246     size_t *lines;
 247     size_t n_lines, allocated_lines;
 248
 249     bool suppress_next_newline;
 250
 251     /* Tokens.
 252
 253        This is a pipeline with the following stages.  Each token eventually
 254        made available to the parser passes through each of these stages.  The
          stages
 255        are named after the processing that happens in each one.
 256
 257        Initially, tokens come from the segmenter and scanner to 'pp':
 258
 259        - pp: Tokens that need to pass through the macro preprocessor to end up
 260          in 'merge'.
 261
 262        - merge: Tokens that need to pass through scan_merge() to end up in
 263          'parse'.
 264
 265        - parse: Tokens available to the client for parsing.
 266
 267       'pp' and 'merge' store tokens only temporarily until they pass into
 268       'parse'.  Tokens then live in 'parse' until the command is fully
 269       consumed, at which time they are freed together. */
 270     struct lex_stage pp;
 271     struct lex_stage merge;
 272     struct lex_token **parse;
 273     size_t n_parse, allocated_parse, parse_ofs;
         /* 'parse' holds 'n_parse' tokens; 'parse_ofs' is the index within
            'parse' of the current token, which lex_get() advances. */
 274   };
 275
 /* Forward declaration; lex_include() and lex_append() call this to wrap a
    reader in a new lex_source. */
 276 static struct lex_source *lex_source_create (struct lexer *,
 277                                              struct lex_reader *);
 278
 279 /* Lexer. */
 280 struct lexer
 281   {
 282     struct ll_list sources;     /* Contains "struct lex_source"s. */
 283     struct macro_set *macros;   /* Macros added with lex_define_macro(). */
 284   };
 285
 /* Forward declarations of file-local helper functions. */
 286 static struct lex_source *lex_source__ (const struct lexer *);
 287 static char *lex_source_syntax__ (const struct lex_source *,
 288                                   int ofs0, int ofs1);
 289 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 290 static void lex_source_push_endcmd__ (struct lex_source *);
 291 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 292 static void lex_source_clear_parse (struct lex_source *);
 293
 294 static bool lex_source_get_parse (struct lex_source *);
 295 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
 296                                    int ofs0, int ofs1,
 297                                    const char *format, va_list)
 298    PRINTF_FORMAT (5, 0);
 299 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 300                                                   int n);
 301 \f
 302 /* Initializes READER with the specified CLASS and otherwise some reasonable
 303    defaults.  The caller should fill in the others members as desired. */
 304 void
 305 lex_reader_init (struct lex_reader *reader,
 306                  const struct lex_reader_class *class)
 307 {
 308   reader->class = class;
 309   reader->syntax = SEG_MODE_AUTO;
 310   reader->error = LEX_ERROR_CONTINUE;
 311   reader->file_name = NULL;
 312   reader->encoding = NULL;
 313   reader->line_number = 0;
 314   reader->eof = false;
 315 }
 316
 317 /* Frees any file name already in READER and replaces it by a copy of
 318    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 319 void
 320 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 321 {
 322   free (reader->file_name);
 323   reader->file_name = xstrdup_if_nonnull (file_name);
 324 }
 325 \f
 326 /* Creates and returns a new lexer. */
 327 struct lexer *
 328 lex_create (void)
 329 {
 330   struct lexer *lexer = xmalloc (sizeof *lexer);
 331   *lexer = (struct lexer) {
 332     .sources = LL_INITIALIZER (lexer->sources),
 333     .macros = macro_set_create (),
 334   };
 335   return lexer;
 336 }
 337
 338 /* Destroys LEXER. */
 339 void
 340 lex_destroy (struct lexer *lexer)
 341 {
 342   if (lexer != NULL)
 343     {
 344       struct lex_source *source, *next;
 345
 346       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 347         {
 348           ll_remove (&source->ll);
 349           lex_source_unref (source);
 350         }
 351       macro_set_destroy (lexer->macros);
 352       free (lexer);
 353     }
 354 }
 355
 356 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 357    same name.  Takes ownership of M. */
 358 void
 359 lex_define_macro (struct lexer *lexer, struct macro *m)
 360 {
 361   macro_set_add (lexer->macros, m);
 362 }
 363
 364 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 365    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 366    token. */
 367 void
 368 lex_include (struct lexer *lexer, struct lex_reader *reader)
 369 {
 370   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 371   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 372 }
 373
 374 /* Appends READER to LEXER, so that it will be read after all other current
 375    readers have already been read. */
 376 void
 377 lex_append (struct lexer *lexer, struct lex_reader *reader)
 378 {
 379   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 380 }
 381 \f
 382 /* Advancing. */
 383
 384 /* Advances LEXER to the next token, consuming the current token. */
 385 void
 386 lex_get (struct lexer *lexer)
 387 {
 388   struct lex_source *src;
 389
 390   src = lex_source__ (lexer);
 391   if (src == NULL)
 392     return;
 393
 394   if (src->parse_ofs < src->n_parse)
 395     {
 396       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 397         lex_source_clear_parse (src);
 398       else
 399         src->parse_ofs++;
 400     }
 401
 402   while (src->parse_ofs == src->n_parse)
 403     if (!lex_source_get_parse (src))
 404       {
 405         ll_remove (&src->ll);
 406         lex_source_unref (src);
 407         src = lex_source__ (lexer);
 408         if (src == NULL)
 409           return;
 410       }
 411 }
 412
 /* Calls lex_get() on LEXER N times, consuming N tokens. */
 void
 lex_get_n (struct lexer *lexer, size_t n)
 {
   for (size_t i = 0; i < n; i++)
     lex_get (lexer);
 }
 420 \f
 421 /* Issuing errors. */
 422
 423 /* Prints a syntax error message containing the current token and
 424    given message MESSAGE (if non-null). */
 425 void
 426 lex_error (struct lexer *lexer, const char *format, ...)
 427 {
 428   va_list args;
 429
 430   va_start (args, format);
 431   lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
 432                       format, args);
 433   va_end (args);
 434 }
 435
 436 /* Prints a syntax error message for the span of tokens N0 through N1,
 437    inclusive, from the current token in LEXER, adding message MESSAGE (if
 438    non-null). */
 439 void
 440 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 441 {
 442   va_list args;
 443
 444   va_start (args, format);
 445   int ofs = lex_ofs (lexer);
 446   lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
 447   va_end (args);
 448 }
 449
 450 /* Prints a syntax error message for the span of tokens with offsets OFS0
 451    through OFS1, inclusive, within the current command in LEXER, adding message
 452    MESSAGE (if non-null). */
 453 void
 454 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
 455 {
 456   va_list args;
 457
 458   va_start (args, format);
 459   lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
 460   va_end (args);
 461 }
 462
 463 /* Prints a message of the given CLASS containing the current token and given
 464    message MESSAGE (if non-null). */
 465 void
 466 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
 467 {
 468   va_list args;
 469
 470   va_start (args, format);
 471   lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
 472                       format, args);
 473   va_end (args);
 474 }
 475
 476 /* Prints a syntax error message for the span of tokens N0 through N1,
 477    inclusive, from the current token in LEXER, adding message MESSAGE (if
 478    non-null). */
 479 void
 480 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
 481               const char *format, ...)
 482 {
 483   va_list args;
 484
 485   va_start (args, format);
 486   int ofs = lex_ofs (lexer);
 487   lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
 488   va_end (args);
 489 }
 490
 491 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
 492    through OFS1, inclusive, within the current command in LEXER, adding message
 493    MESSAGE (if non-null). */
 494 void
 495 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
 496              const char *format, ...)
 497 {
 498   va_list args;
 499
 500   va_start (args, format);
 501   lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
 502   va_end (args);
 503 }
 504
 /* Reports a syntax error stating that one of the strings passed as variable
    arguments is expected.  The argument list ends at the first NULL.

    The function name is parenthesized in this definition, which keeps a
    function-like macro of the same name from expanding here (presumably the
    header defines one; TODO confirm). */
 void
 (lex_error_expecting) (struct lexer *lexer, ...)
 {
   va_list ap;

   va_start (ap, lexer);
   lex_error_expecting_valist (lexer, ap);
   va_end (ap);
 }
 516
 /* Reports a syntax error stating that one of the options in ARGS is expected.
    ARGS must hold "const char *" values and end with a NULL.

    The options are gathered into a dynamically sized array so that none is
    dropped, however many the caller supplies, and then reported through
    lex_error_expecting_array(). */
 void
 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 {
   const char **options = NULL;
   size_t allocated = 0;
   size_t n = 0;

   for (;;)
     {
       const char *option = va_arg (args, const char *);
       if (option == NULL)
         break;

       if (n >= allocated)
         options = x2nrealloc (options, &allocated, sizeof *options);
       options[n++] = option;
     }

   /* With no options at all, OPTIONS is still null; the array function does
      not read it when N is 0. */
   lex_error_expecting_array (lexer, options, n);
   free (options);
 }
 535
 536 void
 537 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 538 {
 539   switch (n)
 540     {
 541     case 0:
 542       lex_error (lexer, NULL);
 543       break;
 544
 545     case 1:
 546       lex_error (lexer, _("Syntax error expecting %s."), options[0]);
 547       break;
 548
 549     case 2:
 550       lex_error (lexer, _("Syntax error expecting %s or %s."),
 551                  options[0], options[1]);
 552       break;
 553
 554     case 3:
 555       lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
 556                  options[0], options[1], options[2]);
 557       break;
 558
 559     case 4:
 560       lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
 561                  options[0], options[1], options[2], options[3]);
 562       break;
 563
 564     case 5:
 565       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
 566                  options[0], options[1], options[2], options[3], options[4]);
 567       break;
 568
 569     case 6:
 570       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
 571                  options[0], options[1], options[2], options[3], options[4],
 572                  options[5]);
 573       break;
 574
 575     case 7:
 576       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
 577                           "or %s."),
 578                  options[0], options[1], options[2], options[3], options[4],
 579                  options[5], options[6]);
 580       break;
 581
 582     case 8:
 583       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
 584                           "or %s."),
 585                  options[0], options[1], options[2], options[3], options[4],
 586                  options[5], options[6], options[7]);
 587       break;
 588
 589     default:
 590       {
 591         struct string s = DS_EMPTY_INITIALIZER;
 592         for (size_t i = 0; i < n; i++)
 593           {
 594             if (i > 0)
 595               ds_put_cstr (&s, ", ");
 596             ds_put_cstr (&s, options[i]);
 597           }
 598         lex_error (lexer, _("Syntax error expecting one of the following: %s."),
 599                    ds_cstr (&s));
 600         ds_destroy (&s);
 601       }
 602       break;
 603     }
 604 }
 605
 606 /* Reports an error to the effect that subcommand SBC may only be specified
 607    once. */
 608 void
 609 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
 610 {
 611   int ofs = lex_ofs (lexer) - 1;
 612   if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
 613     ofs--;
 614
 615   /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
 616      BY. */
 617   if (lex_ofs_at_phrase__ (lexer, ofs, sbc))
 618     lex_ofs_error (lexer, ofs, ofs,
 619                    _("Subcommand %s may only be specified once."), sbc);
 620   else
 621     msg (SE, _("Subcommand %s may only be specified once."), sbc);
 622 }
 623
 /* Reports an error saying that required subcommand SBC was not specified.

    The error spans the tokens from offset 0 through lex_max_ofs(), rather
    than just the current token.  A missing subcommand can normally be
    detected only after the whole command has been parsed, so pointing at the
    current token would not help the user find the problem. */
 void
 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 {
   int last_ofs = lex_max_ofs (lexer);
   lex_ofs_error (lexer, 0, last_ofs,
                  _("Required subcommand %s was not specified."), sbc);
 }
 636
 /* Reports an error, located at the current token, saying that specification
    SPEC may appear only once within subcommand SBC. */
 void
 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 {
   lex_error (lexer,
              _("%s may only be specified once within subcommand %s."),
              spec, sbc);
 }
 645
 /* Reports an error, located at the current token, saying that required
    specification SPEC is absent from subcommand SBC. */
 void
 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 {
   lex_error (lexer,
              _("Required %s specification missing from %s subcommand."),
              spec, sbc);
 }
 654
 655 /* Prints a syntax error message for the span of tokens with offsets OFS0
 656    through OFS1, inclusive, within the current command in LEXER, adding message
 657    MESSAGE (if non-null) with the given ARGS. */
 658 void
 659 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
 660                     int ofs0, int ofs1, const char *format, va_list args)
 661 {
 662   lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
 663 }
 664
 665 /* Checks that we're at end of command.
 666    If so, returns a successful command completion code.
 667    If not, flags a syntax error and returns an error command
 668    completion code. */
 669 int
 670 lex_end_of_command (struct lexer *lexer)
 671 {
 672   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 673     {
 674       lex_error (lexer, _("Syntax error expecting end of command."));
 675       return CMD_FAILURE;
 676     }
 677   else
 678     return CMD_SUCCESS;
 679 }
 680 \f
 681 /* Token testing functions. */
 682
 /* Returns true if LEXER's current token is a number, false otherwise. */
 bool
 lex_is_number (const struct lexer *lexer)
 {
   bool is_number = lex_next_is_number (lexer, 0);
   return is_number;
 }
 689
 /* Returns true if LEXER's current token is a string, false otherwise. */
 bool
 lex_is_string (const struct lexer *lexer)
 {
   bool is_string = lex_next_is_string (lexer, 0);
   return is_string;
 }
 696
 /* Returns the value of LEXER's current token.  The current token must be a
    floating point number. */
 double
 lex_number (const struct lexer *lexer)
 {
   double value = lex_next_number (lexer, 0);
   return value;
 }
 704
 /* Returns true if and only if LEXER's current token is an integer. */
 bool
 lex_is_integer (const struct lexer *lexer)
 {
   bool is_integer = lex_next_is_integer (lexer, 0);
   return is_integer;
 }
 711
 /* Returns the value of LEXER's current token.  The current token must be an
    integer. */
 long
 lex_integer (const struct lexer *lexer)
 {
   long value = lex_next_integer (lexer, 0);
   return value;
 }
 719 \f
 720 /* Token testing functions with lookahead.
 721
 722    A value of 0 for N as an argument to any of these functions refers to the
 723    current token.  Lookahead is limited to the current command.  Any N greater
 724    than the number of tokens remaining in the current command will be treated
 725    as referring to a T_ENDCMD token. */
 726
 /* Returns true if the token that is N tokens ahead of LEXER's current token
    is a number, false otherwise. */
 bool
 lex_next_is_number (const struct lexer *lexer, int n)
 {
   const struct token *ahead = lex_next (lexer, n);
   return token_is_number (ahead);
 }
 733
 /* Returns true if the token that is N tokens ahead of LEXER's current token
    is a string, false otherwise. */
 bool
 lex_next_is_string (const struct lexer *lexer, int n)
 {
   const struct token *ahead = lex_next (lexer, n);
   return token_is_string (ahead);
 }
 740
 /* Returns the value of the token that is N tokens ahead of LEXER's current
    token.  That token must be a floating point number. */
 double
 lex_next_number (const struct lexer *lexer, int n)
 {
   const struct token *ahead = lex_next (lexer, n);
   return token_number (ahead);
 }
 748
 /* Returns true if the token that is N tokens ahead of LEXER's current token
    is an integer, false otherwise. */
 bool
 lex_next_is_integer (const struct lexer *lexer, int n)
 {
   const struct token *ahead = lex_next (lexer, n);
   return token_is_integer (ahead);
 }
 755
 /* Returns the value of the token that is N tokens ahead of LEXER's current
    token.  That token must be an integer. */
 long
 lex_next_integer (const struct lexer *lexer, int n)
 {
   const struct token *ahead = lex_next (lexer, n);
   return token_integer (ahead);
 }
 763 \f
 764 /* Token matching functions. */
 765
 766 /* If the current token has the specified TYPE, skips it and returns true.
 767    Otherwise, returns false. */
 768 bool
 769 lex_match (struct lexer *lexer, enum token_type type)
 770 {
 771   if (lex_token (lexer) == type)
 772     {
 773       lex_get (lexer);
 774       return true;
 775     }
 776   else
 777     return false;
 778 }
 779
 /* Consumes the current token and returns true if it matches IDENTIFIER,
    which may be abbreviated to its first three letters.  Otherwise, returns
    false.

    IDENTIFIER must be an ASCII string. */
 bool
 lex_match_id (struct lexer *lexer, const char *identifier)
 {
   const size_t min_abbrev = 3;
   return lex_match_id_n (lexer, identifier, min_abbrev);
 }
 790
 791 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 792    may be abbreviated to its first N letters.  Otherwise, returns false.
 793
 794    IDENTIFIER must be an ASCII string. */
 795 bool
 796 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 797 {
 798   if (lex_token (lexer) == T_ID
 799       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 800     {
 801       lex_get (lexer);
 802       return true;
 803     }
 804   else
 805     return false;
 806 }
 807
 /* Consumes the current token and returns true if it is an integer whose
    value is X.  Otherwise, leaves LEXER where it is and returns false. */
 bool
 lex_match_int (struct lexer *lexer, int x)
 {
   if (!lex_is_integer (lexer) || lex_integer (lexer) != x)
     return false;

   lex_get (lexer);
   return true;
 }
 821 \f
 822 /* Forced matches. */
 823
 /* Consumes the current token and returns true if it matches IDENTIFIER,
    which may be abbreviated to its first 3 letters.  Otherwise, reports a
    syntax error that names IDENTIFIER as expected and returns false.

    IDENTIFIER must be an ASCII string. */
 bool
 lex_force_match_id (struct lexer *lexer, const char *identifier)
 {
   bool matched = lex_match_id (lexer, identifier);
   if (!matched)
     lex_error_expecting (lexer, identifier);
   return matched;
 }
 840
 841 /* If the current token has the specified TYPE, skips it and returns true.
 842    Otherwise, reports an error and returns false. */
 843 bool
 844 lex_force_match (struct lexer *lexer, enum token_type type)
 845 {
 846   if (lex_token (lexer) == type)
 847     {
 848       lex_get (lexer);
 849       return true;
 850     }
 851   else
 852     {
 853       const char *type_string = token_type_to_string (type);
 854       if (type_string)
 855         {
 856           char *s = xasprintf ("`%s'", type_string);
 857           lex_error_expecting (lexer, s);
 858           free (s);
 859         }
 860       else
 861         lex_error_expecting (lexer, token_type_to_name (type));
 862
 863       return false;
 864     }
 865 }
 866
 /* Returns true, without consuming anything, if the current token is a
    string.  Otherwise, reports a syntax error and returns false. */
 bool
 lex_force_string (struct lexer *lexer)
 {
   bool ok = lex_is_string (lexer);
   if (!ok)
     lex_error (lexer, _("Syntax error expecting string."));
   return ok;
 }
 880
 881 /* If the current token is a string or an identifier, does nothing and returns
 882    true.  Otherwise, reports an error and returns false.
 883
 884    This is meant for use in syntactic situations where we want to encourage the
 885    user to supply a quoted string, but for compatibility we also accept
 886    identifiers.  (One example of such a situation is file names.)  Therefore,
 887    the error message issued when the current token is wrong only says that a
 888    string is expected and doesn't mention that an identifier would also be
 889    accepted. */
 890 bool
 891 lex_force_string_or_id (struct lexer *lexer)
 892 {
 893   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 894 }
 895
 /* Returns true, without consuming anything, if the current token is an
    integer.  Otherwise, reports a syntax error and returns false. */
 bool
 lex_force_int (struct lexer *lexer)
 {
   bool ok = lex_is_integer (lexer);
   if (!ok)
     lex_error (lexer, _("Syntax error expecting integer."));
   return ok;
 }
 909
 910 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 911    nothing and returns true.  Otherwise, reports an error and returns false.
 912    If NAME is nonnull, then it is used in the error message. */
 913 bool
 914 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 915 {
 916   bool is_number = lex_is_number (lexer);
 917   bool is_integer = lex_is_integer (lexer);
 918   bool too_small = (is_integer ? lex_integer (lexer) < min
 919                     : is_number ? lex_number (lexer) < min
 920                     : false);
 921   bool too_big = (is_integer ? lex_integer (lexer) > max
 922                   : is_number ? lex_number (lexer) > max
 923                   : false);
 924   if (is_integer && !too_small && !too_big)
 925     return true;
 926
 927   if (min > max)
 928     {
 929       /* Weird, maybe a bug in the caller.  Just report that we needed an
 930          integer. */
 931       if (name)
 932         lex_error (lexer, _("Syntax error expecting integer for %s."), name);
 933       else
 934         lex_error (lexer, _("Syntax error expecting integer."));
 935     }
 936   else if (min == max)
 937     {
 938       if (name)
 939         lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
 940       else
 941         lex_error (lexer, _("Syntax error expecting %ld."), min);
 942     }
 943   else if (min + 1 == max)
 944     {
 945       if (name)
 946         lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
 947                    min, min + 1, name);
 948       else
 949         lex_error (lexer, _("Syntax error expecting %ld or %ld."),
 950                    min, min + 1);
 951     }
 952   else
 953     {
 954       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 955       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 956
 957       if (report_lower_bound && report_upper_bound)
 958         {
 959           if (name)
 960             lex_error (lexer,
 961                        _("Syntax error expecting integer "
 962                          "between %ld and %ld for %s."),
 963                        min, max, name);
 964           else
 965             lex_error (lexer, _("Syntax error expecting integer "
 966                                 "between %ld and %ld."),
 967                        min, max);
 968         }
 969       else if (report_lower_bound)
 970         {
 971           if (min == 0)
 972             {
 973               if (name)
 974                 lex_error (lexer, _("Syntax error expecting "
 975                                     "non-negative integer for %s."),
 976                            name);
 977               else
 978                 lex_error (lexer, _("Syntax error expecting "
 979                                     "non-negative integer."));
 980             }
 981           else if (min == 1)
 982             {
 983               if (name)
 984                 lex_error (lexer, _("Syntax error expecting "
 985                                     "positive integer for %s."),
 986                            name);
 987               else
 988                 lex_error (lexer, _("Syntax error expecting "
 989                                     "positive integer."));
 990             }
 991           else
 992             {
 993               if (name)
 994                 lex_error (lexer, _("Syntax error expecting "
 995                                     "integer %ld or greater for %s."),
 996                            min, name);
 997               else
 998                 lex_error (lexer, _("Syntax error expecting "
 999                                     "integer %ld or greater."), min);
1000             }
1001         }
1002       else if (report_upper_bound)
1003         {
1004           if (name)
1005             lex_error (lexer,
1006                        _("Syntax error expecting integer less than or equal "
1007                          "to %ld for %s."),
1008                        max, name);
1009           else
1010             lex_error (lexer, _("Syntax error expecting integer less than or "
1011                                 "equal to %ld."),
1012                        max);
1013         }
1014       else
1015         {
1016           if (name)
1017             lex_error (lexer, _("Syntax error expecting integer for %s."),
1018                        name);
1019           else
1020             lex_error (lexer, _("Syntax error expecting integer."));
1021         }
1022     }
1023   return false;
1024 }
1025
/* Requires the current token to be a number.  Returns true, without reporting
   anything, if it is.  Otherwise reports a syntax error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  if (!lex_is_number (lexer))
    {
      lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  return true;
}
1037
/* Requires the current token to be a number in the closed range [MIN,MAX].
   Returns true, without reporting anything, if it is.  Otherwise reports a
   syntax error that describes what was expected and returns false.  If NAME
   is nonnull, the error message mentions it. */
bool
lex_force_num_range_closed (struct lexer *lexer, const char *name,
                            double min, double max)
{
  /* Work out which bound, if any, the token's value violates.  A token that
     is not a number violates neither, but is still rejected below. */
  bool below_min = false;
  bool above_max = false;
  if (lex_is_number (lexer))
    {
      below_min = lex_number (lexer) < min;
      above_max = lex_number (lexer) > max;
      if (!below_min && !above_max)
        return true;
    }

  if (min > max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that a number was needed. */
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  if (min == max)
    {
      /* Exactly one acceptable value. */
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number %g for %s."),
                   min, name);
      else
        lex_error (lexer, _("Syntax error expecting number %g."), min);
      return false;
    }

  /* A bound is mentioned only if it is tighter than +/-DBL_MAX or if the
     token actually violated it. */
  bool mention_min = below_min || min > -DBL_MAX;
  bool mention_max = above_max || max < DBL_MAX;

  if (!mention_min && !mention_max)
    {
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }
  else if (!mention_min)
    {
      /* Upper bound only. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number less than or equal to %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number less than or equal to %g."),
                   max);
    }
  else if (mention_max)
    {
      /* Both bounds. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number between %g and %g for %s."),
                   min, max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number between %g and %g."),
                   min, max);
    }
  else if (min == 0)
    {
      /* Lower bound only, with friendlier wording for a bound of 0. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting non-negative number for %s."),
                   name);
      else
        lex_error (lexer, _("Syntax error expecting non-negative number."));
    }
  else
    {
      /* Lower bound only, general case. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting number %g or greater for %s."),
                   min, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number %g or greater."), min);
    }
  return false;
}
1130
/* Requires the current token to be a number in the half-open range
   [MIN,MAX).  Returns true, without reporting anything, if it is.  Otherwise
   reports a syntax error that describes what was expected and returns false.
   If NAME is nonnull, the error message mentions it. */
bool
lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
                              double min, double max)
{
  /* Work out which bound, if any, the token's value violates.  MAX itself is
     outside the range.  A token that is not a number violates neither bound,
     but is still rejected below. */
  bool below_min = false;
  bool above_max = false;
  if (lex_is_number (lexer))
    {
      below_min = lex_number (lexer) < min;
      above_max = lex_number (lexer) >= max;
      if (!below_min && !above_max)
        return true;
    }

  if (min >= max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that a number was needed. */
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  /* A bound is mentioned only if it is tighter than +/-DBL_MAX or if the
     token actually violated it. */
  bool mention_min = below_min || min > -DBL_MAX;
  bool mention_max = above_max || max < DBL_MAX;

  if (!mention_min && !mention_max)
    {
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }
  else if (!mention_min)
    {
      /* Upper bound only. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number less than %g."), max);
    }
  else if (mention_max)
    {
      /* Both bounds. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting number in [%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
                   min, max);
    }
  else if (min == 0)
    {
      /* Lower bound only, with friendlier wording for a bound of 0. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting non-negative number for %s."),
                   name);
      else
        lex_error (lexer, _("Syntax error expecting non-negative number."));
    }
  else
    {
      /* Lower bound only, general case. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting number %g or greater for %s."),
                   min, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number %g or greater."), min);
    }
  return false;
}
1211
/* Requires the current token to be a number in the open range (MIN,MAX).
   Returns true, without reporting anything, if it is.  Otherwise reports a
   syntax error that describes what was expected and returns false.  If NAME
   is nonnull, the error message mentions it. */
bool
lex_force_num_range_open (struct lexer *lexer, const char *name,
                          double min, double max)
{
  /* Work out which bound, if any, the token's value violates.  Both MIN and
     MAX themselves are outside the range.  A token that is not a number
     violates neither bound, but is still rejected below. */
  bool below_min = false;
  bool above_max = false;
  if (lex_is_number (lexer))
    {
      below_min = lex_number (lexer) <= min;
      above_max = lex_number (lexer) >= max;
      if (!below_min && !above_max)
        return true;
    }

  if (min >= max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that a number was needed. */
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  /* A bound is mentioned only if it is tighter than +/-DBL_MAX or if the
     token actually violated it. */
  bool mention_min = below_min || min > -DBL_MAX;
  bool mention_max = above_max || max < DBL_MAX;

  if (!mention_min && !mention_max)
    {
      if (name != NULL)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }
  else if (!mention_min)
    {
      /* Upper bound only. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number less than %g."), max);
    }
  else if (mention_max)
    {
      /* Both bounds. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting number in (%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Syntax error expecting number in (%g,%g)."),
                   min, max);
    }
  else if (min == 0)
    {
      /* Lower bound only, with friendlier wording for a bound of 0. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting positive number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting positive number."));
    }
  else
    {
      /* Lower bound only, general case. */
      if (name != NULL)
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number greater than %g for %s."),
                   min, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number greater than %g."), min);
    }
  return false;
}
1292
1293 /* If the current token is an identifier, does nothing and returns true.
1294    Otherwise, reports an error and returns false. */
1295 bool
1296 lex_force_id (struct lexer *lexer)
1297 {
1298   if (lex_token (lexer) == T_ID)
1299     return true;
1300
1301   lex_error (lexer, _("Syntax error expecting identifier."));
1302   return false;
1303 }
1304 \f
1305 /* Token accessors. */
1306
1307 /* Returns the type of LEXER's current token. */
1308 enum token_type
1309 lex_token (const struct lexer *lexer)
1310 {
1311   return lex_next_token (lexer, 0);
1312 }
1313
/* Returns the numeric value of the current token in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token type this function returns zero. */
double
lex_tokval (const struct lexer *lexer)
{
  /* The current token is lookahead position 0. */
  const int current = 0;
  return lex_next_tokval (lexer, current);
}
1323
/* Returns the string in the current token in LEXER, as a null-terminated
   UTF-8 string.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The returned UTF-8 string can be used directly for variable names and other
   identifiers.  To use it as a file name, pass it through filename_to_utf8();
   to use it in a "union value", pass it through data_in(). */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  /* The current token is lookahead position 0. */
  const int current = 0;
  return lex_next_tokcstr (lexer, current);
}
1337
1338 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1339    null-terminated (but the null terminator is not included in the returned
1340    substring's 'length').
1341
1342    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1343    this functions this function will always return NULL.
1344
1345    The UTF-8 encoding of the returned string is correct for variable names and
1346    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1347    data_in() to use it in a "union value".  */
1348 struct substring
1349 lex_tokss (const struct lexer *lexer)
1350 {
1351   return lex_next_tokss (lexer, 0);
1352 }
1353 \f
1354 /* Looking ahead.
1355
1356    A value of 0 for N as an argument to any of these functions refers to the
1357    current token.  Lookahead is limited to the current command.  Any N greater
1358    than the number of tokens remaining in the current command will be treated
1359    as referring to a T_ENDCMD token. */
1360
1361 static const struct lex_token *
1362 lex_next__ (const struct lexer *lexer_, int n)
1363 {
1364   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1365   struct lex_source *src = lex_source__ (lexer);
1366
1367   if (src != NULL)
1368     return lex_source_next__ (src, n);
1369   else
1370     {
1371       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1372       return &stop_token;
1373     }
1374 }
1375
1376 static const struct lex_token *
1377 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1378 {
1379   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1380
1381   if (ofs < 0)
1382     {
1383       static const struct lex_token endcmd_token
1384         = { .token = { .type = T_ENDCMD } };
1385       return &endcmd_token;
1386     }
1387
1388   while (ofs >= src->n_parse)
1389     {
1390       if (src->n_parse > 0)
1391         {
1392           const struct lex_token *t = src->parse[src->n_parse - 1];
1393           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1394             return t;
1395         }
1396
1397       lex_source_get_parse (src);
1398     }
1399
1400   return src->parse[ofs];
1401 }
1402
1403 static const struct lex_token *
1404 lex_source_next__ (const struct lex_source *src, int n)
1405 {
1406   return lex_source_ofs__ (src, n + src->parse_ofs);
1407 }
1408
1409 /* Returns the "struct token" of the token N after the current one in LEXER.
1410    The returned pointer can be invalidated by pretty much any succeeding call
1411    into the lexer, although the string pointer within the returned token is
1412    only invalidated by consuming the token (e.g. with lex_get()). */
1413 const struct token *
1414 lex_next (const struct lexer *lexer, int n)
1415 {
1416   return &lex_next__ (lexer, n)->token;
1417 }
1418
1419 /* Returns the type of the token N after the current one in LEXER. */
1420 enum token_type
1421 lex_next_token (const struct lexer *lexer, int n)
1422 {
1423   return lex_next (lexer, n)->type;
1424 }
1425
/* Returns the numeric value of the token N after the current one in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token type this function returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1435
1436 /* Returns the null-terminated string in the token N after the current one, in
1437    UTF-8 encoding.
1438
1439    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1440    this functions this function will always return NULL.
1441
1442    The UTF-8 encoding of the returned string is correct for variable names and
1443    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1444    data_in() to use it in a "union value".  */
1445 const char *
1446 lex_next_tokcstr (const struct lexer *lexer, int n)
1447 {
1448   return lex_next_tokss (lexer, n).string;
1449 }
1450
1451 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1452    The string is null-terminated (but the null terminator is not included in
1453    the returned substring's 'length').
1454
1455    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1456    tokens this functions this function will always return NULL.
1457
1458    The UTF-8 encoding of the returned string is correct for variable names and
1459    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1460    data_in() to use it in a "union value".  */
1461 struct substring
1462 lex_next_tokss (const struct lexer *lexer, int n)
1463 {
1464   return lex_next (lexer, n)->string;
1465 }
1466
1467 /* Returns the offset of the current token within the command being parsed in
1468    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1469    on.  The return value is useful later for referring to this token in calls
1470    to lex_ofs_*(). */
1471 int
1472 lex_ofs (const struct lexer *lexer)
1473 {
1474   struct lex_source *src = lex_source__ (lexer);
1475   return src ? src->parse_ofs : 0;
1476 }
1477
1478 /* Returns the offset of the last token in the current command. */
1479 int
1480 lex_max_ofs (const struct lexer *lexer)
1481 {
1482   struct lex_source *src = lex_source__ (lexer);
1483   if (!src)
1484     return 0;
1485
1486   int ofs = MAX (1, src->n_parse) - 1;
1487   for (;;)
1488     {
1489       enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1490       if (type == T_ENDCMD || type == T_STOP)
1491         return ofs;
1492
1493       ofs++;
1494     }
1495 }
1496
1497 /* Returns the token within LEXER's current command with offset OFS.  Use
1498    lex_ofs() to find out the offset of the current token. */
1499 const struct token *
1500 lex_ofs_token (const struct lexer *lexer_, int ofs)
1501 {
1502   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1503   struct lex_source *src = lex_source__ (lexer);
1504
1505   if (src != NULL)
1506     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1507   else
1508     {
1509       static const struct token stop_token = { .type = T_STOP };
1510       return &stop_token;
1511     }
1512 }
1513
/* Returns a newly allocated struct msg_location that covers the tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in LEXER.
   lex_ofs() explains token offsets.

   The caller owns the returned object and must eventually free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() counts from the current token, so rebase both
     offsets against it. */
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1525
1526 /* Returns a msg_point for the first character in the token with offset OFS,
1527    where offset 0 is the first token in the command currently being parsed, 1
1528    the second token, and so on.  These are absolute offsets, not relative to
1529    the token currently being parsed within the command.
1530
1531    Returns zeros for a T_STOP token.
1532  */
1533 struct msg_point
1534 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1535 {
1536   const struct lex_source *src = lex_source__ (lexer);
1537   return (src
1538           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1539           : (struct msg_point) { 0, 0 });
1540 }
1541
1542 /* Returns a msg_point for the last character, inclusive, in the token with
1543    offset OFS, where offset 0 is the first token in the command currently being
1544    parsed, 1 the second token, and so on.  These are absolute offsets, not
1545    relative to the token currently being parsed within the command.
1546
1547    Returns zeros for a T_STOP token.
1548
1549    Most of the time, a single token is wholly within a single line of syntax,
1550    so that the start and end point for a given offset have the same line
1551    number.  There are two exceptions: a T_STRING token can be made up of
1552    multiple segments on adjacent lines connected with "+" punctuators, and a
1553    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1554    the next.
1555  */
1556 struct msg_point
1557 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1558 {
1559   const struct lex_source *src = lex_source__ (lexer);
1560   return (src
1561           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1562           : (struct msg_point) { 0, 0 });
1563 }
1564
1565 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1566    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1567    are both zero, this requests the syntax for the current token.)
1568
1569    The caller must eventually free the returned string (with free()).  The
1570    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1571    that, for example, it may include comments, spaces, and new-lines if it
1572    spans multiple tokens.  Macro expansion, however, has already been
1573    performed. */
1574 char *
1575 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1576 {
1577   const struct lex_source *src = lex_source__ (lexer);
1578   return (src
1579           ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1580           : xstrdup (""));
1581 }
1582
1583
/* Returns the syntax text for the tokens with offsets OFS0 through OFS1,
   inclusive.  (With OFS0 and OFS1 both zero, for example, this is the syntax
   for the first token in the current command.)  Returns an empty string if
   LEXER has no current source.

   The returned string is UTF-8 encoded and the caller must eventually free it
   with free().  It is in the original form supplied to the lexer, so a span
   of several tokens may include comments, spaces, and new-lines.  Macro
   expansion, however, has already been performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1599
1600 /* Returns true if the token N ahead of the current one was produced by macro
1601    expansion, false otherwise. */
1602 bool
1603 lex_next_is_from_macro (const struct lexer *lexer, int n)
1604 {
1605   return lex_next__ (lexer, n)->macro_rep != NULL;
1606 }
1607
1608 static bool
1609 lex_tokens_match (const struct token *actual, const struct token *expected)
1610 {
1611   if (actual->type != expected->type)
1612     return false;
1613
1614   switch (actual->type)
1615     {
1616     case T_POS_NUM:
1617     case T_NEG_NUM:
1618       return actual->number == expected->number;
1619
1620     case T_ID:
1621       return lex_id_match (expected->string, actual->string);
1622
1623     case T_STRING:
1624       return (actual->string.length == expected->string.length
1625               && !memcmp (actual->string.string, expected->string.string,
1626                           actual->string.length));
1627
1628     default:
1629       return true;
1630     }
1631 }
1632
1633 static size_t
1634 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s)
1635 {
1636   struct string_lexer slex;
1637   struct token token;
1638
1639   size_t i = 0;
1640   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1641   while (string_lexer_next (&slex, &token))
1642     {
1643       bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + i++), &token);
1644       token_uninit (&token);
1645       if (!match)
1646         return 0;
1647     }
1648   return i;
1649 }
1650
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may be an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or
   "END INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
  return n_tokens > 0;
}
1662
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips past those tokens and returns true.  Otherwise leaves LEXER where it
   is and returns false.

   S may be an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or
   "END INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1677
1678 /* Returns the 1-based line number of the source text at the byte OFFSET in
1679    SRC. */
1680 static int
1681 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1682 {
1683   size_t lo = 0;
1684   size_t hi = src->n_lines;
1685   for (;;)
1686     {
1687       size_t mid = (lo + hi) / 2;
1688       if (mid + 1 >= src->n_lines)
1689         return src->n_lines;
1690       else if (offset >= src->lines[mid + 1])
1691         lo = mid;
1692       else if (offset < src->lines[mid])
1693         hi = mid;
1694       else
1695         return mid + 1;
1696     }
1697 }
1698
1699 /* Returns the 1-based column number of the source text at the byte OFFSET in
1700    SRC. */
1701 static int
1702 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1703 {
1704   const char *newline = memrchr (src->buffer, '\n', offset);
1705   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1706   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1707 }
1708
1709 static struct msg_point
1710 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1711 {
1712   return (struct msg_point) {
1713     .line = lex_source_ofs_to_line_number (src, offset),
1714     .column = lex_source_ofs_to_column_number (src, offset),
1715   };
1716 }
1717
1718 static struct msg_point
1719 lex_token_start_point (const struct lex_source *src,
1720                        const struct lex_token *token)
1721 {
1722   return lex_source_ofs_to_point__ (src, token->token_pos);
1723 }
1724
1725 static struct msg_point
1726 lex_token_end_point (const struct lex_source *src,
1727                      const struct lex_token *token)
1728 {
1729   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1730 }
1731
1732 static struct msg_location
1733 lex_token_location (const struct lex_source *src,
1734                     const struct lex_token *t0,
1735                     const struct lex_token *t1)
1736 {
1737   return (struct msg_location) {
1738     .file_name = intern_new_if_nonnull (src->reader->file_name),
1739     .start = lex_token_start_point (src, t0),
1740     .end = lex_token_end_point (src, t1),
1741     .src = CONST_CAST (struct lex_source *, src),
1742   };
1743 }
1744
1745 static struct msg_location *
1746 lex_token_location_rw (const struct lex_source *src,
1747                        const struct lex_token *t0,
1748                        const struct lex_token *t1)
1749 {
1750   struct msg_location location = lex_token_location (src, t0, t1);
1751   return msg_location_dup (&location);
1752 }
1753
/* Returns a copy, made by lex_token_location_rw(), of the msg_location that
   runs from the token at offset OFS0 through the token at offset OFS1 in
   SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1761
1762 /* Returns the name of the syntax file from which the current command is drawn.
1763    Returns NULL for a T_STOP token or if the command's source does not have
1764    line numbers.
1765
1766    There is no version of this function that takes an N argument because
1767    lookahead only works to the end of a command and any given command is always
1768    within a single syntax file. */
1769 const char *
1770 lex_get_file_name (const struct lexer *lexer)
1771 {
1772   struct lex_source *src = lex_source__ (lexer);
1773   return src == NULL ? NULL : src->reader->file_name;
1774 }
1775
1776 /* Returns a newly allocated msg_location for the syntax that represents tokens
1777    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1778    must eventually free the location (with msg_location_destroy()). */
1779 struct msg_location *
1780 lex_get_location (const struct lexer *lexer, int n0, int n1)
1781 {
1782   struct msg_location *loc = xmalloc (sizeof *loc);
1783   *loc = (struct msg_location) {
1784     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1785     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1786     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1787     .src = lex_source__ (lexer),
1788   };
1789   lex_source_ref (loc->src);
1790   return loc;
1791 }
1792
1793 const char *
1794 lex_get_encoding (const struct lexer *lexer)
1795 {
1796   struct lex_source *src = lex_source__ (lexer);
1797   return src == NULL ? NULL : src->reader->encoding;
1798 }
1799
1800 /* Returns the syntax mode for the syntax file from which the current drawn is
1801    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1802    does not have line numbers.
1803
1804    There is no version of this function that takes an N argument because
1805    lookahead only works to the end of a command and any given command is always
1806    within a single syntax file. */
1807 enum segmenter_mode
1808 lex_get_syntax_mode (const struct lexer *lexer)
1809 {
1810   struct lex_source *src = lex_source__ (lexer);
1811   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1812 }
1813
1814 /* Returns the error mode for the syntax file from which the current drawn is
1815    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1816    source does not have line numbers.
1817
1818    There is no version of this function that takes an N argument because
1819    lookahead only works to the end of a command and any given command is always
1820    within a single syntax file. */
1821 enum lex_error_mode
1822 lex_get_error_mode (const struct lexer *lexer)
1823 {
1824   struct lex_source *src = lex_source__ (lexer);
1825   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1826 }
1827
1828 /* If the source that LEXER is currently reading has error mode
1829    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1830    token to be read comes directly from whatever is next read from the stream.
1831
1832    It makes sense to call this function after encountering an error in a
1833    command entered on the console, because usually the user would prefer not to
1834    have cascading errors. */
1835 void
1836 lex_interactive_reset (struct lexer *lexer)
1837 {
1838   struct lex_source *src = lex_source__ (lexer);
1839   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1840     {
1841       src->length = 0;
1842       src->journal_pos = src->seg_pos = 0;
1843       src->n_lines = 0;
1844       src->suppress_next_newline = false;
1845       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1846                                        false);
1847       lex_stage_clear (&src->pp);
1848       lex_stage_clear (&src->merge);
1849       lex_source_clear_parse (src);
1850       lex_source_push_endcmd__ (src);
1851     }
1852 }
1853
1854 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1855 void
1856 lex_discard_rest_of_command (struct lexer *lexer)
1857 {
1858   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1859     lex_get (lexer);
1860 }
1861
1862 /* Discards all lookahead tokens in LEXER, then discards all input sources
1863    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1864    runs out of input sources. */
1865 void
1866 lex_discard_noninteractive (struct lexer *lexer)
1867 {
1868   struct lex_source *src = lex_source__ (lexer);
1869
1870   if (src != NULL)
1871     {
1872       lex_stage_clear (&src->pp);
1873       lex_stage_clear (&src->merge);
1874       lex_source_clear_parse (src);
1875
1876       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1877            src = lex_source__ (lexer))
1878         {
1879           ll_remove (&src->ll);
1880           lex_source_unref (src);
1881         }
1882     }
1883 }
1884 \f
1885 static void
1886 lex_source_expand__ (struct lex_source *src)
1887 {
1888   if (src->length >= src->allocated)
1889     src->buffer = x2realloc (src->buffer, &src->allocated);
1890 }
1891
1892 static void
1893 lex_source_read__ (struct lex_source *src)
1894 {
1895   do
1896     {
1897       lex_source_expand__ (src);
1898
1899       size_t space = src->allocated - src->length;
1900       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1901       size_t n = src->reader->class->read (src->reader,
1902                                            &src->buffer[src->length],
1903                                            space, prompt);
1904       assert (n <= space);
1905
1906       if (n == 0)
1907         {
1908           /* End of input. */
1909           src->reader->eof = true;
1910           return;
1911         }
1912
1913       src->length += n;
1914     }
1915   while (!memchr (&src->buffer[src->seg_pos], '\n',
1916                   src->length - src->seg_pos));
1917 }
1918
1919 static struct lex_source *
1920 lex_source__ (const struct lexer *lexer)
1921 {
1922   return (ll_is_empty (&lexer->sources) ? NULL
1923           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1924 }
1925
1926 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1927    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
1928    both zero, this requests the syntax for the first token in the current
1929    command.)  The caller must eventually free the returned string (with
1930    free()).  The syntax is encoded in UTF-8 and in the original form supplied
1931    to the lexer so that, for example, it may include comments, spaces, and
1932    new-lines if it spans multiple tokens.  Macro expansion, however, has
1933    already been performed. */
1934 static char *
1935 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1936 {
1937   struct string s = DS_EMPTY_INITIALIZER;
1938   for (size_t i = ofs0; i <= ofs1; )
1939     {
1940       /* Find [I,J) as the longest sequence of tokens not produced by macro
1941          expansion, or otherwise the longest sequence expanded from a single
1942          macro call. */
1943       const struct lex_token *first = lex_source_ofs__ (src, i);
1944       size_t j;
1945       for (j = i + 1; j <= ofs1; j++)
1946         {
1947           const struct lex_token *cur = lex_source_ofs__ (src, j);
1948           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1949               || first->macro_rep != cur->macro_rep)
1950             break;
1951         }
1952       const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1953
1954       /* Now add the syntax for this sequence of tokens to SRC. */
1955       if (!ds_is_empty (&s))
1956         ds_put_byte (&s, ' ');
1957       if (!first->macro_rep)
1958         {
1959           size_t start = first->token_pos;
1960           size_t end = last->token_pos + last->token_len;
1961           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1962         }
1963       else
1964         {
1965           size_t start = first->ofs;
1966           size_t end = last->ofs + last->len;
1967           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1968                                            end - start));
1969         }
1970
1971       i = j;
1972     }
1973   return ds_steal_cstr (&s);
1974 }
1975
1976 static bool
1977 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1978 {
1979   for (int i = ofs0; i <= ofs1; i++)
1980     if (lex_source_ofs__ (src, i)->macro_rep)
1981       return true;
1982   return false;
1983 }
1984
1985 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1986    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1987    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1988    the original form supplied to the lexer so that, for example, it may include
1989    comments, spaces, and new-lines if it spans multiple tokens.
1990
1991    Returns an empty string if the token range doesn't include a macro call.
1992
1993    The caller must not modify or free the returned string. */
1994 static struct substring
1995 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
1996 {
1997   if (!lex_source_contains_macro_call (src, ofs0, ofs1))
1998     return ss_empty ();
1999
2000   const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2001   const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2002   size_t start = token0->token_pos;
2003   size_t end = token1->token_pos + token1->token_len;
2004
2005   return ss_buffer (&src->buffer[start], end - start);
2006 }
2007
2008 static void
2009 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2010                        int ofs0, int ofs1, const char *format, va_list args)
2011 {
2012   struct string s = DS_EMPTY_INITIALIZER;
2013
2014   if (src)
2015     {
2016       /* Get the macro call(s) that expanded to the syntax that caused the
2017          error. */
2018       char call[64];
2019       str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2020                      call, sizeof call);
2021       if (call[0])
2022         ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2023     }
2024   else
2025     ds_put_cstr (&s, _("At end of input"));
2026
2027   if (!ds_is_empty (&s))
2028     ds_put_cstr (&s, ": ");
2029   if (format)
2030     ds_put_vformat (&s, format, args);
2031   else
2032     ds_put_cstr (&s, _("Syntax error."));
2033
2034   if (ds_last (&s) != '.')
2035     ds_put_byte (&s, '.');
2036
2037   struct msg *m = xmalloc (sizeof *m);
2038   *m = (struct msg) {
2039     .category = msg_class_to_category (class),
2040     .severity = msg_class_to_severity (class),
2041     .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2042     .text = ds_steal_cstr (&s),
2043   };
2044   msg_emit (m);
2045 }
2046
2047 static void
2048 lex_get_error (struct lex_source *src, const struct lex_token *token)
2049 {
2050   char syntax[64];
2051   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2052                  syntax, sizeof syntax);
2053
2054   struct string s = DS_EMPTY_INITIALIZER;
2055   ds_put_cstr (&s, token->token.string.string);
2056
2057   struct msg *m = xmalloc (sizeof *m);
2058   *m = (struct msg) {
2059     .category = MSG_C_SYNTAX,
2060     .severity = MSG_S_ERROR,
2061     .location = lex_token_location_rw (src, token, token),
2062     .text = ds_steal_cstr (&s),
2063   };
2064   msg_emit (m);
2065 }
2066
2067 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2068    underlying lex_reader if necessary.  Returns true if a new token was added
2069    to SRC's deque, false otherwise.  The caller should retry failures unless
2070    SRC's 'eof' marker was set to true indicating that there will be no more
2071    tokens from this source. */
2072 static bool
2073 lex_source_try_get_pp (struct lex_source *src)
2074 {
2075   /* Append a new token to SRC and initialize it. */
2076   struct lex_token *token = xmalloc (sizeof *token);
2077   token->token = (struct token) { .type = T_STOP };
2078   token->macro_rep = NULL;
2079   token->ref_cnt = NULL;
2080   token->token_pos = src->seg_pos;
2081
2082   /* Extract a segment. */
2083   const char *segment;
2084   enum segment_type seg_type;
2085   int seg_len;
2086   for (;;)
2087     {
2088       segment = &src->buffer[src->seg_pos];
2089       seg_len = segmenter_push (&src->segmenter, segment,
2090                                 src->length - src->seg_pos,
2091                                 src->reader->eof, &seg_type);
2092       if (seg_len >= 0)
2093         break;
2094
2095       /* The segmenter needs more input to produce a segment. */
2096       assert (!src->reader->eof);
2097       lex_source_read__ (src);
2098     }
2099
2100   /* Update state based on the segment. */
2101   token->token_len = seg_len;
2102   src->seg_pos += seg_len;
2103   if (seg_type == SEG_NEWLINE)
2104     {
2105       if (src->n_lines >= src->allocated_lines)
2106         src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2107                                  sizeof *src->lines);
2108       src->lines[src->n_lines++] = src->seg_pos;
2109     }
2110
2111   /* Get a token from the segment. */
2112   enum tokenize_result result = token_from_segment (
2113     seg_type, ss_buffer (segment, seg_len), &token->token);
2114
2115   /* If we've reached the end of a line, or the end of a command, then pass
2116      the line to the output engine as a syntax text item.  */
2117   int n_lines = seg_type == SEG_NEWLINE;
2118   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2119     {
2120       n_lines++;
2121       src->suppress_next_newline = true;
2122     }
2123   else if (n_lines > 0 && src->suppress_next_newline)
2124     {
2125       n_lines--;
2126       src->suppress_next_newline = false;
2127     }
2128   for (int i = 0; i < n_lines; i++)
2129     {
2130       /* Beginning of line. */
2131       const char *line = &src->buffer[src->journal_pos];
2132
2133       /* Calculate line length, including \n or \r\n end-of-line if present.
2134
2135          We use src->length even though that may be beyond what we've actually
2136          converted to tokens.  That's because, if we're emitting the line due
2137          to SEG_END_COMMAND, we want to take the whole line through the
2138          newline, not just through the '.'. */
2139       size_t max_len = src->length - src->journal_pos;
2140       const char *newline = memchr (line, '\n', max_len);
2141       size_t line_len = newline ? newline - line + 1 : max_len;
2142
2143       /* Calculate line length excluding end-of-line. */
2144       size_t copy_len = line_len;
2145       if (copy_len > 0 && line[copy_len - 1] == '\n')
2146         copy_len--;
2147       if (copy_len > 0 && line[copy_len - 1] == '\r')
2148         copy_len--;
2149
2150       /* Submit the line as syntax. */
2151       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2152                                                    xmemdup0 (line, copy_len),
2153                                                    NULL));
2154
2155       src->journal_pos += line_len;
2156     }
2157
2158   switch (result)
2159     {
2160     case TOKENIZE_ERROR:
2161       lex_get_error (src, token);
2162       /* Fall through. */
2163     case TOKENIZE_EMPTY:
2164       lex_token_destroy (token);
2165       return false;
2166
2167     case TOKENIZE_TOKEN:
2168       if (token->token.type == T_STOP)
2169         {
2170           token->token.type = T_ENDCMD;
2171           src->eof = true;
2172         }
2173       lex_stage_push_last (&src->pp, token);
2174       return true;
2175     }
2176   NOT_REACHED ();
2177 }
2178
2179 /* Attempts to append a new token to SRC.  Returns true if successful, false on
2180    failure.  On failure, the end of SRC has been reached and no more tokens
2181    will be forthcoming from it.
2182
2183    Does not make the new token available for lookahead yet; the caller must
2184    adjust SRC's 'middle' pointer to do so. */
2185 static bool
2186 lex_source_get_pp (struct lex_source *src)
2187 {
2188   while (!src->eof)
2189     if (lex_source_try_get_pp (src))
2190       return true;
2191   return false;
2192 }
2193
2194 static bool
2195 lex_source_try_get_merge (const struct lex_source *src_)
2196 {
2197   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2198
2199   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2200     return false;
2201
2202   if (!settings_get_mexpand ())
2203     {
2204       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2205       return true;
2206     }
2207
2208   /* Now pass tokens one-by-one to the macro expander.
2209
2210      In the common case where there is no macro to expand, the loop is not
2211      entered.  */
2212   struct macro_call *mc;
2213   int n_call = macro_call_create (src->lexer->macros,
2214                                   &lex_stage_first (&src->pp)->token, &mc);
2215   for (int ofs = 1; !n_call; ofs++)
2216     {
2217       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2218         {
2219           /* This should not be reachable because we always get a T_ENDCMD at
2220              the end of an input file (transformed from T_STOP by
2221              lex_source_try_get_pp()) and the macro_expander should always
2222              terminate expansion on T_ENDCMD. */
2223           NOT_REACHED ();
2224         }
2225
2226       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2227       const struct macro_token mt = {
2228         .token = t->token,
2229         .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2230       };
2231       const struct msg_location loc = lex_token_location (src, t, t);
2232       n_call = macro_call_add (mc, &mt, &loc);
2233     }
2234   if (n_call < 0)
2235     {
2236       /* False alarm: no macro expansion after all.  Use first token as
2237          lookahead.  We'll retry macro expansion from the second token next
2238          time around. */
2239       macro_call_destroy (mc);
2240       lex_stage_shift (&src->merge, &src->pp, 1);
2241       return true;
2242     }
2243
2244   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2245      are a macro call.  (These are likely to be the only tokens in 'pp'.)
2246      Expand them.  */
2247   const struct lex_token *c0 = lex_stage_first (&src->pp);
2248   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2249   struct macro_tokens expansion = { .n = 0 };
2250   struct msg_location loc = lex_token_location (src, c0, c1);
2251   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2252   macro_call_destroy (mc);
2253
2254   /* Convert the macro expansion into syntax for possible error messages
2255      later. */
2256   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2257   size_t *len = xnmalloc (expansion.n, sizeof *len);
2258   struct string s = DS_EMPTY_INITIALIZER;
2259   macro_tokens_to_syntax (&expansion, &s, ofs, len);
2260
2261   if (settings_get_mprint ())
2262     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2263                                           _("Macro Expansion")));
2264
2265   /* Append the macro expansion tokens to the lookahead. */
2266   if (expansion.n > 0)
2267     {
2268       char *macro_rep = ds_steal_cstr (&s);
2269       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2270       *ref_cnt = expansion.n;
2271       for (size_t i = 0; i < expansion.n; i++)
2272         {
2273           struct lex_token *token = xmalloc (sizeof *token);
2274           *token = (struct lex_token) {
2275             .token = expansion.mts[i].token,
2276             .token_pos = c0->token_pos,
2277             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2278             .macro_rep = macro_rep,
2279             .ofs = ofs[i],
2280             .len = len[i],
2281             .ref_cnt = ref_cnt,
2282           };
2283           lex_stage_push_last (&src->merge, token);
2284
2285           ss_dealloc (&expansion.mts[i].syntax);
2286         }
2287     }
2288   else
2289     ds_destroy (&s);
2290   free (expansion.mts);
2291   free (ofs);
2292   free (len);
2293
2294   /* Destroy the tokens for the call. */
2295   for (size_t i = 0; i < n_call; i++)
2296     lex_stage_pop_first (&src->pp);
2297
2298   return expansion.n > 0;
2299 }
2300
2301 /* Attempts to obtain at least one new token into 'merge' in SRC.
2302
2303    Returns true if successful, false on failure.  In the latter case, SRC is
2304    exhausted and 'src->eof' is now true. */
2305 static bool
2306 lex_source_get_merge (struct lex_source *src)
2307 {
2308   while (!src->eof)
2309     if (lex_source_try_get_merge (src))
2310       return true;
2311   return false;
2312 }
2313
2314 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2315
2316    Returns true if successful, false on failure.  In the latter case, SRC is
2317    exhausted and 'src->eof' is now true. */
2318 static bool
2319 lex_source_get_parse (struct lex_source *src)
2320 {
2321   struct merger m = MERGER_INIT;
2322   struct token out;
2323   for (size_t i = 0; ; i++)
2324     {
2325       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2326         {
2327           /* We always get a T_ENDCMD at the end of an input file
2328              (transformed from T_STOP by lex_source_try_get_pp()) and
2329              merger_add() should never return -1 on T_ENDCMD. */
2330           assert (lex_stage_is_empty (&src->merge));
2331           return false;
2332         }
2333
2334       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2335                                &out);
2336       if (!retval)
2337         {
2338           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2339           return true;
2340         }
2341       else if (retval > 0)
2342         {
2343           /* Add a token that merges all the tokens together. */
2344           const struct lex_token *first = lex_stage_first (&src->merge);
2345           const struct lex_token *last = lex_stage_nth (&src->merge,
2346                                                         retval - 1);
2347           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2348           struct lex_token *t = xmalloc (sizeof *t);
2349           *t = (struct lex_token) {
2350             .token = out,
2351             .token_pos = first->token_pos,
2352             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2353
2354             /* This works well if all the tokens were not expanded from macros,
2355                or if they came from the same macro expansion.  It just gives up
2356                in the other (corner) cases. */
2357             .macro_rep = macro ? first->macro_rep : NULL,
2358             .ofs = macro ? first->ofs : 0,
2359             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2360             .ref_cnt = macro ? first->ref_cnt : NULL,
2361           };
2362           if (t->ref_cnt)
2363             ++*t->ref_cnt;
2364           lex_source_push_parse (src, t);
2365
2366           for (int i = 0; i < retval; i++)
2367             lex_stage_pop_first (&src->merge);
2368           return true;
2369         }
2370     }
2371 }
2372 \f
2373 static void
2374 lex_source_push_endcmd__ (struct lex_source *src)
2375 {
2376   assert (src->n_parse == 0);
2377
2378   struct lex_token *token = xmalloc (sizeof *token);
2379   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2380   lex_source_push_parse (src, token);
2381 }
2382
2383 static void
2384 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2385 {
2386   if (src->n_parse >= src->allocated_parse)
2387     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2388                              sizeof *src->parse);
2389   src->parse[src->n_parse++] = token;
2390 }
2391
2392 static void
2393 lex_source_clear_parse (struct lex_source *src)
2394 {
2395   for (size_t i = 0; i < src->n_parse; i++)
2396     lex_token_destroy (src->parse[i]);
2397   src->n_parse = src->parse_ofs = 0;
2398 }
2399
2400 static struct lex_source *
2401 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2402 {
2403   size_t allocated_lines = 4;
2404   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2405   *lines = 0;
2406
2407   struct lex_source *src = xmalloc (sizeof *src);
2408   *src = (struct lex_source) {
2409     .n_refs = 1,
2410     .reader = reader,
2411     .segmenter = segmenter_init (reader->syntax, false),
2412     .lexer = lexer,
2413     .lines = lines,
2414     .n_lines = 1,
2415     .allocated_lines = allocated_lines,
2416   };
2417
2418   lex_source_push_endcmd__ (src);
2419
2420   return src;
2421 }
2422
2423 void
2424 lex_set_message_handler (struct lexer *lexer,
2425                          void (*output_msg) (const struct msg *,
2426                                              struct lexer *))
2427 {
2428   struct msg_handler msg_handler = {
2429     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2430     .aux = lexer,
2431     .lex_source_ref = lex_source_ref,
2432     .lex_source_unref = lex_source_unref,
2433     .lex_source_get_line = lex_source_get_line,
2434   };
2435   msg_set_handler (&msg_handler);
2436 }
2437
2438 struct lex_source *
2439 lex_source_ref (const struct lex_source *src_)
2440 {
2441   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2442   if (src)
2443     {
2444       assert (src->n_refs > 0);
2445       src->n_refs++;
2446     }
2447   return src;
2448 }
2449
2450 void
2451 lex_source_unref (struct lex_source *src)
2452 {
2453   if (!src)
2454     return;
2455
2456   assert (src->n_refs > 0);
2457   if (--src->n_refs > 0)
2458     return;
2459
2460   char *file_name = src->reader->file_name;
2461   char *encoding = src->reader->encoding;
2462   if (src->reader->class->destroy != NULL)
2463     src->reader->class->destroy (src->reader);
2464   free (file_name);
2465   free (encoding);
2466   free (src->buffer);
2467   free (src->lines);
2468   lex_stage_uninit (&src->pp);
2469   lex_stage_uninit (&src->merge);
2470   lex_source_clear_parse (src);
2471   free (src->parse);
2472   free (src);
2473 }
2474 \f
2475 struct lex_file_reader
2476   {
2477     struct lex_reader reader;
2478     struct u8_istream *istream;
2479   };
2480
2481 static struct lex_reader_class lex_file_reader_class;
2482
2483 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2484    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2485    ENCODING, which should take one of the forms accepted by
2486    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2487    mode of the new reader, respectively.
2488
2489    Returns a null pointer if FILE_NAME cannot be opened. */
2490 struct lex_reader *
2491 lex_reader_for_file (const char *file_name, const char *encoding,
2492                      enum segmenter_mode syntax,
2493                      enum lex_error_mode error)
2494 {
2495   struct lex_file_reader *r;
2496   struct u8_istream *istream;
2497
2498   istream = (!strcmp(file_name, "-")
2499              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2500              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2501   if (istream == NULL)
2502     {
2503       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2504       return NULL;
2505     }
2506
2507   r = xmalloc (sizeof *r);
2508   lex_reader_init (&r->reader, &lex_file_reader_class);
2509   r->reader.syntax = syntax;
2510   r->reader.error = error;
2511   r->reader.file_name = xstrdup (file_name);
2512   r->reader.encoding = xstrdup_if_nonnull (encoding);
2513   r->reader.line_number = 1;
2514   r->istream = istream;
2515
2516   return &r->reader;
2517 }
2518
2519 static struct lex_file_reader *
2520 lex_file_reader_cast (struct lex_reader *r)
2521 {
2522   return UP_CAST (r, struct lex_file_reader, reader);
2523 }
2524
2525 static size_t
2526 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2527                enum prompt_style prompt_style UNUSED)
2528 {
2529   struct lex_file_reader *r = lex_file_reader_cast (r_);
2530   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2531   if (n_read < 0)
2532     {
2533       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2534       return 0;
2535     }
2536   return n_read;
2537 }
2538
2539 static void
2540 lex_file_close (struct lex_reader *r_)
2541 {
2542   struct lex_file_reader *r = lex_file_reader_cast (r_);
2543
2544   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2545     {
2546       if (u8_istream_close (r->istream) != 0)
2547         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2548     }
2549   else
2550     u8_istream_free (r->istream);
2551
2552   free (r);
2553 }
2554
2555 static struct lex_reader_class lex_file_reader_class =
2556   {
2557     lex_file_read,
2558     lex_file_close
2559   };
2560 \f
2561 struct lex_string_reader
2562   {
2563     struct lex_reader reader;
2564     struct substring s;
2565     size_t offset;
2566   };
2567
2568 static struct lex_reader_class lex_string_reader_class;
2569
2570 /* Creates and returns a new lex_reader for the contents of S, which must be
2571    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2572    with ss_dealloc() when it is closed. */
2573 struct lex_reader *
2574 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2575 {
2576   struct lex_string_reader *r;
2577
2578   r = xmalloc (sizeof *r);
2579   lex_reader_init (&r->reader, &lex_string_reader_class);
2580   r->reader.syntax = SEG_MODE_AUTO;
2581   r->reader.encoding = xstrdup_if_nonnull (encoding);
2582   r->s = s;
2583   r->offset = 0;
2584
2585   return &r->reader;
2586 }
2587
2588 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2589    which must be encoded in ENCODING.  The caller retains ownership of S. */
2590 struct lex_reader *
2591 lex_reader_for_string (const char *s, const char *encoding)
2592 {
2593   struct substring ss;
2594   ss_alloc_substring (&ss, ss_cstr (s));
2595   return lex_reader_for_substring_nocopy (ss, encoding);
2596 }
2597
/* Formats FORMAT as a printf()-like format string, taking its arguments from
   the variable arguments that follow ENCODING, and creates and returns a new
   lex_reader for the formatted result, which is taken to be encoded in
   ENCODING.  The reader owns the formatted string. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2612
2613 static struct lex_string_reader *
2614 lex_string_reader_cast (struct lex_reader *r)
2615 {
2616   return UP_CAST (r, struct lex_string_reader, reader);
2617 }
2618
2619 static size_t
2620 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2621                  enum prompt_style prompt_style UNUSED)
2622 {
2623   struct lex_string_reader *r = lex_string_reader_cast (r_);
2624   size_t chunk;
2625
2626   chunk = MIN (n, r->s.length - r->offset);
2627   memcpy (buf, r->s.string + r->offset, chunk);
2628   r->offset += chunk;
2629
2630   return chunk;
2631 }
2632
2633 static void
2634 lex_string_close (struct lex_reader *r_)
2635 {
2636   struct lex_string_reader *r = lex_string_reader_cast (r_);
2637
2638   ss_dealloc (&r->s);
2639   free (r);
2640 }
2641
2642 static struct lex_reader_class lex_string_reader_class =
2643   {
2644     lex_string_read,
2645     lex_string_close
2646   };
2647 \f
2648 struct substring
2649 lex_source_get_line (const struct lex_source *src, int line)
2650 {
2651   if (line < 1 || line > src->n_lines)
2652     return ss_empty ();
2653
2654   size_t ofs = src->lines[line - 1];
2655   size_t end;
2656   if (line < src->n_lines)
2657     end = src->lines[line];
2658   else
2659     {
2660       const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2661       end = newline ? newline - src->buffer : src->length;
2662     }
2663   return ss_buffer (&src->buffer[ofs], end - ofs);
2664 }