pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Besides the token itself, this records where the token came from, either
   directly in the source buffer or by way of a macro expansion. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared among tokens: lex_token_destroy()
       decrements '*ref_cnt' and frees both once it reaches zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
/* Forward declarations of file-local helpers that are used before their
   definitions. */
static struct msg_point lex_token_start_point (const struct lex_source *,
                                               const struct lex_token *);
static struct msg_point lex_token_end_point (const struct lex_source *,
                                             const struct lex_token *);

static size_t lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s);
  89
  90 /* Source offset of the last byte in TOKEN. */
  91 static size_t
  92 lex_token_end (const struct lex_token *token)
  93 {
  94   return token->token_pos + MAX (token->token_len, 1) - 1;
  95 }
  96
  97 static void
  98 lex_token_destroy (struct lex_token *t)
  99 {
 100   token_uninit (&t->token);
 101   if (t->ref_cnt)
 102     {
 103       assert (*t->ref_cnt > 0);
 104       if (!--*t->ref_cnt)
 105         {
 106           free (t->macro_rep);
 107           free (t->ref_cnt);
 108         }
 109     }
 110   free (t);
 111 }
 112 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   The first token (the one with the least lookahead) sits at the back of the
   deque; new tokens are pushed at the front. */
struct lex_stage
  {
    struct deque deque;         /* Index bookkeeping for 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, indexed via 'deque'. */
  };
 120
/* Forward declarations of the lex_stage operations defined below. */

/* Destruction. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

/* Size queries. */
static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

/* Token access. */
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

/* Adding and removing tokens. */
static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

/* Moving tokens between stages. */
static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 135
 136 /* Deletes all the tokens from STAGE. */
 137 static void
 138 lex_stage_clear (struct lex_stage *stage)
 139 {
 140   while (!deque_is_empty (&stage->deque))
 141     lex_stage_pop_first (stage);
 142 }
 143
 144 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 145 static void
 146 lex_stage_uninit (struct lex_stage *stage)
 147 {
 148   lex_stage_clear (stage);
 149   free (stage->tokens);
 150 }
 151
 152 /* Returns true if STAGE contains no tokens, otherwise false. */
 153 static bool
 154 lex_stage_is_empty (const struct lex_stage *stage)
 155 {
 156   return deque_is_empty (&stage->deque);
 157 }
 158
 159 /* Returns the number of tokens in STAGE. */
 160 static size_t
 161 lex_stage_count (const struct lex_stage *stage)
 162 {
 163   return deque_count (&stage->deque);
 164 }
 165
 166 /* Returns the first token in STAGE, which must be nonempty.
 167    The first token is the one accessed with the least lookahead. */
 168 static struct lex_token *
 169 lex_stage_first (struct lex_stage *stage)
 170 {
 171   return lex_stage_nth (stage, 0);
 172 }
 173
 174 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 175    lookahead) is 0, the second token is 1, and so on.  There must be at least
 176    INDEX + 1 tokens in STAGE. */
 177 static struct lex_token *
 178 lex_stage_nth (struct lex_stage *stage, size_t index)
 179 {
 180   return stage->tokens[deque_back (&stage->deque, index)];
 181 }
 182
 183 /* Adds TOKEN so that it becomes the last token in STAGE. */
 184 static void
 185 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 186 {
 187   if (deque_is_full (&stage->deque))
 188     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 189                                   sizeof *stage->tokens);
 190   stage->tokens[deque_push_front (&stage->deque)] = token;
 191 }
 192
 193 /* Removes and returns the first token from STAGE. */
 194 static struct lex_token *
 195 lex_stage_take_first (struct lex_stage *stage)
 196 {
 197   return stage->tokens[deque_pop_back (&stage->deque)];
 198 }
 199
 200 /* Removes the first token from STAGE and uninitializes it. */
 201 static void
 202 lex_stage_pop_first (struct lex_stage *stage)
 203 {
 204   lex_token_destroy (lex_stage_take_first (stage));
 205 }
 206
 207 /* Removes the first N tokens from SRC, appending them to DST as the last
 208    tokens. */
 209 static void
 210 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 211 {
 212   for (size_t i = 0; i < n; i++)
 213     lex_stage_push_last (dst, lex_stage_take_first (src));
 214 }
 215
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */

    /* Reference count:

       - One for struct lexer.

       - One for each struct msg_location that references this source. */
    size_t n_refs;

    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    /* Offset into 'buffer' of starts of lines. */
    size_t *lines;
    size_t n_lines, allocated_lines;

    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through each of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'parse'.

       - parse: Tokens available to the client for parsing.

      'pp' and 'merge' store tokens only temporarily until they pass into
      'parse'.  Tokens then live in 'parse' until the command is fully
      consumed, at which time they are freed together.

      'parse' is an array of 'n_parse' tokens, and 'parse_ofs' is the index
      within it of the current token (see lex_get()).  'allocated_parse' is
      presumably the array's capacity -- TODO confirm against
      lex_source_push_parse(). */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_token **parse;
    size_t n_parse, allocated_parse, parse_ofs;
  };
 275
/* Forward declaration; used by lex_include() and lex_append() to wrap a reader
   in a new lex_source. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
 278
/* Lexer.

   Holds the list of sources that tokens are read from, plus the macros that
   have been defined with lex_define_macro(). */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added via lex_define_macro(). */
  };
 285
/* Forward declarations of file-local helpers that operate on a lexer or on one
   of its lex_sources. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_syntax__ (const struct lex_source *,
                                  int ofs0, int ofs1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);
static void lex_source_push_parse (struct lex_source *, struct lex_token *);
static void lex_source_clear_parse (struct lex_source *);

static bool lex_source_get_parse (struct lex_source *);
static void lex_source_msg_valist (struct lex_source *, enum msg_class,
                                   int ofs0, int ofs1,
                                   const char *format, va_list)
   PRINTF_FORMAT (5, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 301 \f
 302 /* Initializes READER with the specified CLASS and otherwise some reasonable
 303    defaults.  The caller should fill in the others members as desired. */
 304 void
 305 lex_reader_init (struct lex_reader *reader,
 306                  const struct lex_reader_class *class)
 307 {
 308   reader->class = class;
 309   reader->syntax = SEG_MODE_AUTO;
 310   reader->error = LEX_ERROR_CONTINUE;
 311   reader->file_name = NULL;
 312   reader->encoding = NULL;
 313   reader->line_number = 0;
 314   reader->eof = false;
 315 }
 316
 317 /* Frees any file name already in READER and replaces it by a copy of
 318    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 319 void
 320 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 321 {
 322   free (reader->file_name);
 323   reader->file_name = xstrdup_if_nonnull (file_name);
 324 }
 325 \f
 326 /* Creates and returns a new lexer. */
 327 struct lexer *
 328 lex_create (void)
 329 {
 330   struct lexer *lexer = xmalloc (sizeof *lexer);
 331   *lexer = (struct lexer) {
 332     .sources = LL_INITIALIZER (lexer->sources),
 333     .macros = macro_set_create (),
 334   };
 335   return lexer;
 336 }
 337
 338 /* Destroys LEXER. */
 339 void
 340 lex_destroy (struct lexer *lexer)
 341 {
 342   if (lexer != NULL)
 343     {
 344       struct lex_source *source, *next;
 345
 346       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 347         {
 348           ll_remove (&source->ll);
 349           lex_source_unref (source);
 350         }
 351       macro_set_destroy (lexer->macros);
 352       free (lexer);
 353     }
 354 }
 355
 356 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 357    same name.  Takes ownership of M. */
 358 void
 359 lex_define_macro (struct lexer *lexer, struct macro *m)
 360 {
 361   macro_set_add (lexer->macros, m);
 362 }
 363
 364 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 365    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 366    token. */
 367 void
 368 lex_include (struct lexer *lexer, struct lex_reader *reader)
 369 {
 370   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 371   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 372 }
 373
 374 /* Appends READER to LEXER, so that it will be read after all other current
 375    readers have already been read. */
 376 void
 377 lex_append (struct lexer *lexer, struct lex_reader *reader)
 378 {
 379   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 380 }
 381 \f
 382 /* Advancing. */
 383
 384 /* Advances LEXER to the next token, consuming the current token. */
 385 void
 386 lex_get (struct lexer *lexer)
 387 {
 388   struct lex_source *src;
 389
 390   src = lex_source__ (lexer);
 391   if (src == NULL)
 392     return;
 393
 394   if (src->parse_ofs < src->n_parse)
 395     {
 396       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 397         lex_source_clear_parse (src);
 398       else
 399         src->parse_ofs++;
 400     }
 401
 402   while (src->parse_ofs == src->n_parse)
 403     if (!lex_source_get_parse (src))
 404       {
 405         ll_remove (&src->ll);
 406         lex_source_unref (src);
 407         src = lex_source__ (lexer);
 408         if (src == NULL)
 409           return;
 410       }
 411 }
 412
 413 /* Advances LEXER by N tokens. */
 414 void
 415 lex_get_n (struct lexer *lexer, size_t n)
 416 {
 417   while (n-- > 0)
 418     lex_get (lexer);
 419 }
 420 \f
 421 /* Issuing errors. */
 422
 423 /* Prints a syntax error message containing the current token and
 424    given message MESSAGE (if non-null). */
 425 void
 426 lex_error (struct lexer *lexer, const char *format, ...)
 427 {
 428   va_list args;
 429
 430   va_start (args, format);
 431   lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
 432                       format, args);
 433   va_end (args);
 434 }
 435
 436 /* Prints a syntax error message for the span of tokens N0 through N1,
 437    inclusive, from the current token in LEXER, adding message MESSAGE (if
 438    non-null). */
 439 void
 440 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 441 {
 442   va_list args;
 443
 444   va_start (args, format);
 445   int ofs = lex_ofs (lexer);
 446   lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
 447   va_end (args);
 448 }
 449
 450 /* Prints a syntax error message for the span of tokens with offsets OFS0
 451    through OFS1, inclusive, within the current command in LEXER, adding message
 452    MESSAGE (if non-null). */
 453 void
 454 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
 455 {
 456   va_list args;
 457
 458   va_start (args, format);
 459   lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
 460   va_end (args);
 461 }
 462
 463 /* Prints a message of the given CLASS containing the current token and given
 464    message MESSAGE (if non-null). */
 465 void
 466 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
 467 {
 468   va_list args;
 469
 470   va_start (args, format);
 471   lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
 472                       format, args);
 473   va_end (args);
 474 }
 475
 476 /* Prints a syntax error message for the span of tokens N0 through N1,
 477    inclusive, from the current token in LEXER, adding message MESSAGE (if
 478    non-null). */
 479 void
 480 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
 481               const char *format, ...)
 482 {
 483   va_list args;
 484
 485   va_start (args, format);
 486   int ofs = lex_ofs (lexer);
 487   lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
 488   va_end (args);
 489 }
 490
 491 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
 492    through OFS1, inclusive, within the current command in LEXER, adding message
 493    MESSAGE (if non-null). */
 494 void
 495 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
 496              const char *format, ...)
 497 {
 498   va_list args;
 499
 500   va_start (args, format);
 501   lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
 502   va_end (args);
 503 }
 504
 505 /* Prints a syntax error message saying that one of the strings provided as
 506    varargs, up to the first NULL, is expected. */
 507 void
 508 (lex_error_expecting) (struct lexer *lexer, ...)
 509 {
 510   va_list args;
 511
 512   va_start (args, lexer);
 513   lex_error_expecting_valist (lexer, args);
 514   va_end (args);
 515 }
 516
 517 /* Prints a syntax error message saying that one of the options provided in
 518    ARGS, up to the first NULL, is expected. */
 519 void
 520 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 521 {
 522   enum { MAX_OPTIONS = 9 };
 523   const char *options[MAX_OPTIONS];
 524   int n = 0;
 525   while (n < MAX_OPTIONS)
 526     {
 527       const char *option = va_arg (args, const char *);
 528       if (!option)
 529         break;
 530
 531       options[n++] = option;
 532     }
 533   lex_error_expecting_array (lexer, options, n);
 534 }
 535
 536 void
 537 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 538 {
 539   switch (n)
 540     {
 541     case 0:
 542       lex_error (lexer, NULL);
 543       break;
 544
 545     case 1:
 546       lex_error (lexer, _("Syntax error expecting %s."), options[0]);
 547       break;
 548
 549     case 2:
 550       lex_error (lexer, _("Syntax error expecting %s or %s."),
 551                  options[0], options[1]);
 552       break;
 553
 554     case 3:
 555       lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
 556                  options[0], options[1], options[2]);
 557       break;
 558
 559     case 4:
 560       lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
 561                  options[0], options[1], options[2], options[3]);
 562       break;
 563
 564     case 5:
 565       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
 566                  options[0], options[1], options[2], options[3], options[4]);
 567       break;
 568
 569     case 6:
 570       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
 571                  options[0], options[1], options[2], options[3], options[4],
 572                  options[5]);
 573       break;
 574
 575     case 7:
 576       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
 577                           "or %s."),
 578                  options[0], options[1], options[2], options[3], options[4],
 579                  options[5], options[6]);
 580       break;
 581
 582     case 8:
 583       lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
 584                           "or %s."),
 585                  options[0], options[1], options[2], options[3], options[4],
 586                  options[5], options[6], options[7]);
 587       break;
 588
 589     default:
 590       {
 591         struct string s = DS_EMPTY_INITIALIZER;
 592         for (size_t i = 0; i < n; i++)
 593           {
 594             if (i > 0)
 595               ds_put_cstr (&s, ", ");
 596             ds_put_cstr (&s, options[i]);
 597           }
 598         lex_error (lexer, _("Syntax error expecting one of the following: %s."),
 599                    ds_cstr (&s));
 600         ds_destroy (&s);
 601       }
 602       break;
 603     }
 604 }
 605
 606 /* Reports an error to the effect that subcommand SBC may only be specified
 607    once. */
 608 void
 609 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
 610 {
 611   int ofs = lex_ofs (lexer) - 1;
 612   if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
 613     ofs--;
 614
 615   /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
 616      BY. */
 617   if (lex_ofs_at_phrase__ (lexer, ofs, sbc))
 618     lex_ofs_error (lexer, ofs, ofs,
 619                    _("Subcommand %s may only be specified once."), sbc);
 620   else
 621     msg (SE, _("Subcommand %s may only be specified once."), sbc);
 622 }
 623
 624 /* Reports an error to the effect that subcommand SBC is missing.
 625
 626    This function does not take a lexer as an argument or use lex_error(),
 627    because a missing subcommand can normally be detected only after the whole
 628    command has been parsed, and so lex_error() would always report "Syntax
 629    error at end of command", which does not help the user find the error. */
 630 void
 631 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 632 {
 633   lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
 634                  _("Required subcommand %s was not specified."), sbc);
 635 }
 636
 637 /* Reports an error to the effect that specification SPEC may only be specified
 638    once within subcommand SBC. */
 639 void
 640 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 641 {
 642   lex_error (lexer, _("%s may only be specified once within subcommand %s."),
 643              spec, sbc);
 644 }
 645
 646 /* Reports an error to the effect that specification SPEC is missing within
 647    subcommand SBC. */
 648 void
 649 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 650 {
 651   lex_error (lexer, _("Required %s specification missing from %s subcommand."),
 652              spec, sbc);
 653 }
 654
 655 /* Prints a syntax error message for the span of tokens with offsets OFS0
 656    through OFS1, inclusive, within the current command in LEXER, adding message
 657    MESSAGE (if non-null) with the given ARGS. */
 658 void
 659 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
 660                     int ofs0, int ofs1, const char *format, va_list args)
 661 {
 662   lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
 663 }
 664
 665 /* Checks that we're at end of command.
 666    If so, returns a successful command completion code.
 667    If not, flags a syntax error and returns an error command
 668    completion code. */
 669 int
 670 lex_end_of_command (struct lexer *lexer)
 671 {
 672   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 673     {
 674       lex_error (lexer, _("Syntax error expecting end of command."));
 675       return CMD_FAILURE;
 676     }
 677   else
 678     return CMD_SUCCESS;
 679 }
 680 \f
 681 /* Token testing functions. */
 682
 683 /* Returns true if the current token is a number. */
 684 bool
 685 lex_is_number (const struct lexer *lexer)
 686 {
 687   return lex_next_is_number (lexer, 0);
 688 }
 689
 690 /* Returns true if the current token is a string. */
 691 bool
 692 lex_is_string (const struct lexer *lexer)
 693 {
 694   return lex_next_is_string (lexer, 0);
 695 }
 696
 697 /* Returns the value of the current token, which must be a
 698    floating point number. */
 699 double
 700 lex_number (const struct lexer *lexer)
 701 {
 702   return lex_next_number (lexer, 0);
 703 }
 704
 705 /* Returns true iff the current token is an integer. */
 706 bool
 707 lex_is_integer (const struct lexer *lexer)
 708 {
 709   return lex_next_is_integer (lexer, 0);
 710 }
 711
 712 /* Returns the value of the current token, which must be an
 713    integer. */
 714 long
 715 lex_integer (const struct lexer *lexer)
 716 {
 717   return lex_next_integer (lexer, 0);
 718 }
 719 \f
 720 /* Token testing functions with lookahead.
 721
 722    A value of 0 for N as an argument to any of these functions refers to the
 723    current token.  Lookahead is limited to the current command.  Any N greater
 724    than the number of tokens remaining in the current command will be treated
 725    as referring to a T_ENDCMD token. */
 726
 727 /* Returns true if the token N ahead of the current token is a number. */
 728 bool
 729 lex_next_is_number (const struct lexer *lexer, int n)
 730 {
 731   return token_is_number (lex_next (lexer, n));
 732 }
 733
 734 /* Returns true if the token N ahead of the current token is a string. */
 735 bool
 736 lex_next_is_string (const struct lexer *lexer, int n)
 737 {
 738   return token_is_string (lex_next (lexer, n));
 739 }
 740
 741 /* Returns the value of the token N ahead of the current token, which must be a
 742    floating point number. */
 743 double
 744 lex_next_number (const struct lexer *lexer, int n)
 745 {
 746   return token_number (lex_next (lexer, n));
 747 }
 748
 749 /* Returns true if the token N ahead of the current token is an integer. */
 750 bool
 751 lex_next_is_integer (const struct lexer *lexer, int n)
 752 {
 753   return token_is_integer (lex_next (lexer, n));
 754 }
 755
 756 /* Returns the value of the token N ahead of the current token, which must be
 757    an integer. */
 758 long
 759 lex_next_integer (const struct lexer *lexer, int n)
 760 {
 761   return token_integer (lex_next (lexer, n));
 762 }
 763 \f
 764 /* Token matching functions. */
 765
 766 /* If the current token has the specified TYPE, skips it and returns true.
 767    Otherwise, returns false. */
 768 bool
 769 lex_match (struct lexer *lexer, enum token_type type)
 770 {
 771   if (lex_token (lexer) == type)
 772     {
 773       lex_get (lexer);
 774       return true;
 775     }
 776   else
 777     return false;
 778 }
 779
 780 /* If the current token matches IDENTIFIER, skips it and returns true.
 781    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 782    returns false.
 783
 784    IDENTIFIER must be an ASCII string. */
 785 bool
 786 lex_match_id (struct lexer *lexer, const char *identifier)
 787 {
 788   return lex_match_id_n (lexer, identifier, 3);
 789 }
 790
 791 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 792    may be abbreviated to its first N letters.  Otherwise, returns false.
 793
 794    IDENTIFIER must be an ASCII string. */
 795 bool
 796 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 797 {
 798   if (lex_token (lexer) == T_ID
 799       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 800     {
 801       lex_get (lexer);
 802       return true;
 803     }
 804   else
 805     return false;
 806 }
 807
 808 /* If the current token is integer X, skips it and returns true.  Otherwise,
 809    returns false. */
 810 bool
 811 lex_match_int (struct lexer *lexer, int x)
 812 {
 813   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 814     {
 815       lex_get (lexer);
 816       return true;
 817     }
 818   else
 819     return false;
 820 }
 821 \f
 822 /* Forced matches. */
 823
 824 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 825    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 826    false.
 827
 828    IDENTIFIER must be an ASCII string. */
 829 bool
 830 lex_force_match_id (struct lexer *lexer, const char *identifier)
 831 {
 832   if (lex_match_id (lexer, identifier))
 833     return true;
 834   else
 835     {
 836       lex_error_expecting (lexer, identifier);
 837       return false;
 838     }
 839 }
 840
 841 /* If the current token has the specified TYPE, skips it and returns true.
 842    Otherwise, reports an error and returns false. */
 843 bool
 844 lex_force_match (struct lexer *lexer, enum token_type type)
 845 {
 846   if (lex_token (lexer) == type)
 847     {
 848       lex_get (lexer);
 849       return true;
 850     }
 851   else
 852     {
 853       const char *type_string = token_type_to_string (type);
 854       if (type_string)
 855         {
 856           char *s = xasprintf ("`%s'", type_string);
 857           lex_error_expecting (lexer, s);
 858           free (s);
 859         }
 860       else
 861         lex_error_expecting (lexer, token_type_to_name (type));
 862
 863       return false;
 864     }
 865 }
 866
 867 /* If the current token is a string, does nothing and returns true.
 868    Otherwise, reports an error and returns false. */
 869 bool
 870 lex_force_string (struct lexer *lexer)
 871 {
 872   if (lex_is_string (lexer))
 873     return true;
 874   else
 875     {
 876       lex_error (lexer, _("Syntax error expecting string."));
 877       return false;
 878     }
 879 }
 880
 881 /* If the current token is a string or an identifier, does nothing and returns
 882    true.  Otherwise, reports an error and returns false.
 883
 884    This is meant for use in syntactic situations where we want to encourage the
 885    user to supply a quoted string, but for compatibility we also accept
 886    identifiers.  (One example of such a situation is file names.)  Therefore,
 887    the error message issued when the current token is wrong only says that a
 888    string is expected and doesn't mention that an identifier would also be
 889    accepted. */
 890 bool
 891 lex_force_string_or_id (struct lexer *lexer)
 892 {
 893   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 894 }
 895
 896 /* If the current token is an integer, does nothing and returns true.
 897    Otherwise, reports an error and returns false. */
 898 bool
 899 lex_force_int (struct lexer *lexer)
 900 {
 901   if (lex_is_integer (lexer))
 902     return true;
 903   else
 904     {
 905       lex_error (lexer, _("Syntax error expecting integer."));
 906       return false;
 907     }
 908 }
 909
 910 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 911    nothing and returns true.  Otherwise, reports an error and returns false.
 912    If NAME is nonnull, then it is used in the error message. */
 913 bool
 914 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 915 {
 916   bool is_number = lex_is_number (lexer);
 917   bool is_integer = lex_is_integer (lexer);
 918   bool too_small = (is_integer ? lex_integer (lexer) < min
 919                     : is_number ? lex_number (lexer) < min
 920                     : false);
 921   bool too_big = (is_integer ? lex_integer (lexer) > max
 922                   : is_number ? lex_number (lexer) > max
 923                   : false);
 924   if (is_integer && !too_small && !too_big)
 925     return true;
 926
 927   if (min > max)
 928     {
 929       /* Weird, maybe a bug in the caller.  Just report that we needed an
 930          integer. */
 931       if (name)
 932         lex_error (lexer, _("Syntax error expecting integer for %s."), name);
 933       else
 934         lex_error (lexer, _("Syntax error expecting integer."));
 935     }
 936   else if (min == max)
 937     {
 938       if (name)
 939         lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
 940       else
 941         lex_error (lexer, _("Syntax error expecting %ld."), min);
 942     }
 943   else if (min + 1 == max)
 944     {
 945       if (name)
 946         lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
 947                    min, min + 1, name);
 948       else
 949         lex_error (lexer, _("Syntax error expecting %ld or %ld."),
 950                    min, min + 1);
 951     }
 952   else
 953     {
 954       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 955       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 956
 957       if (report_lower_bound && report_upper_bound)
 958         {
 959           if (name)
 960             lex_error (lexer,
 961                        _("Syntax error expecting integer "
 962                          "between %ld and %ld for %s."),
 963                        min, max, name);
 964           else
 965             lex_error (lexer, _("Syntax error expecting integer "
 966                                 "between %ld and %ld."),
 967                        min, max);
 968         }
 969       else if (report_lower_bound)
 970         {
 971           if (min == 0)
 972             {
 973               if (name)
 974                 lex_error (lexer, _("Syntax error expecting "
 975                                     "non-negative integer for %s."),
 976                            name);
 977               else
 978                 lex_error (lexer, _("Syntax error expecting "
 979                                     "non-negative integer."));
 980             }
 981           else if (min == 1)
 982             {
 983               if (name)
 984                 lex_error (lexer, _("Syntax error expecting "
 985                                     "positive integer for %s."),
 986                            name);
 987               else
 988                 lex_error (lexer, _("Syntax error expecting "
 989                                     "positive integer."));
 990             }
 991           else
 992             {
 993               if (name)
 994                 lex_error (lexer, _("Syntax error expecting "
 995                                     "integer %ld or greater for %s."),
 996                            min, name);
 997               else
 998                 lex_error (lexer, _("Syntax error expecting "
 999                                     "integer %ld or greater."), min);
1000             }
1001         }
1002       else if (report_upper_bound)
1003         {
1004           if (name)
1005             lex_error (lexer,
1006                        _("Syntax error expecting integer less than or equal "
1007                          "to %ld for %s."),
1008                        max, name);
1009           else
1010             lex_error (lexer, _("Syntax error expecting integer less than or "
1011                                 "equal to %ld."),
1012                        max);
1013         }
1014       else
1015         {
1016           if (name)
1017             lex_error (lexer, _("Syntax error expecting integer for %s."),
1018                        name);
1019           else
1020             lex_error (lexer, _("Syntax error expecting integer."));
1021         }
1022     }
1023   return false;
1024 }
1025
/* Requires the current token to be a number.  Returns true, doing nothing
   else, if it is one.  Otherwise, reports a syntax error and returns
   false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);
  if (!ok)
    lex_error (lexer, _("Syntax error expecting number."));
  return ok;
}
1037
/* Requires the current token to be a number in the closed range [MIN,MAX].
   Returns true, with no other effect, if it is.  Otherwise, reports a syntax
   error that describes the acceptable values and returns false.  A nonnull
   NAME is mentioned in the error message. */
bool
lex_force_num_range_closed (struct lexer *lexer, const char *name,
                            double min, double max)
{
  bool below = false;
  bool above = false;
  if (lex_is_number (lexer))
    {
      below = lex_number (lexer) < min;
      above = lex_number (lexer) > max;
      if (!below && !above)
        return true;
    }

  if (min > max)
    {
      /* Empty range, perhaps a bug in the caller.  Just say that a number
         was needed. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  if (min == max)
    {
      /* Exactly one acceptable value. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number %g for %s."),
                   min, name);
      else
        lex_error (lexer, _("Syntax error expecting number %g."), min);
      return false;
    }

  /* Mention a bound if the token violated it or if it is not the extreme
     value that stands for "no limit". */
  bool mention_min = below || min > -DBL_MAX;
  bool mention_max = above || max < DBL_MAX;

  if (mention_min && mention_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "between %g and %g for %s."),
                   min, max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "between %g and %g."),
                   min, max);
    }
  else if (mention_min)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "non-negative number for %s."),
                       name);
          else
            lex_error (lexer,
                       _("Syntax error expecting "
                         "non-negative number."));
        }
      else
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "%g or greater for %s."),
                       min, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "%g or greater."),
                       min);
        }
    }
  else if (mention_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "less than or equal to %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "less than or equal to %g."),
                   max);
    }
  else
    {
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }

  return false;
}
1130
/* Requires the current token to be a number in the half-open range
   [MIN,MAX).  Returns true, with no other effect, if it is.  Otherwise,
   reports a syntax error that describes the acceptable values and returns
   false.  A nonnull NAME is mentioned in the error message. */
bool
lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
                              double min, double max)
{
  bool below = false;
  bool above = false;
  if (lex_is_number (lexer))
    {
      below = lex_number (lexer) < min;
      above = lex_number (lexer) >= max;
      if (!below && !above)
        return true;
    }

  if (min >= max)
    {
      /* Empty range, perhaps a bug in the caller.  Just say that a number
         was needed. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  /* Mention a bound if the token violated it or if it is not the extreme
     value that stands for "no limit". */
  bool mention_min = below || min > -DBL_MAX;
  bool mention_max = above || max < DBL_MAX;

  if (mention_min && mention_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "in [%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
                   min, max);
    }
  else if (mention_min)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "non-negative number for %s."),
                       name);
          else
            lex_error (lexer,
                       _("Syntax error expecting "
                         "non-negative number."));
        }
      else
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "number %g or greater for %s."),
                       min, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting "
                         "number %g or greater."),
                       min);
        }
    }
  else if (mention_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting "
                     "number less than %g."),
                   max);
    }
  else
    {
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }

  return false;
}
1211
/* Requires the current token to be a number in the open range (MIN,MAX).
   Returns true, with no other effect, if it is.  Otherwise, reports a syntax
   error that describes the acceptable values and returns false.  A nonnull
   NAME is mentioned in the error message. */
bool
lex_force_num_range_open (struct lexer *lexer, const char *name,
                          double min, double max)
{
  bool below = false;
  bool above = false;
  if (lex_is_number (lexer))
    {
      below = lex_number (lexer) <= min;
      above = lex_number (lexer) >= max;
      if (!below && !above)
        return true;
    }

  if (min >= max)
    {
      /* Empty range, perhaps a bug in the caller.  Just say that a number
         was needed. */
      if (name)
        lex_error (lexer, _("Syntax error expecting number for %s."), name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
      return false;
    }

  /* Mention a bound if the token violated it or if it is not the extreme
     value that stands for "no limit". */
  bool mention_min = below || min > -DBL_MAX;
  bool mention_max = above || max < DBL_MAX;

  if (mention_min && mention_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "in (%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "in (%g,%g)."),
                   min, max);
    }
  else if (mention_min)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting "
                         "positive number for %s."),
                       name);
          else
            lex_error (lexer,
                       _("Syntax error expecting "
                         "positive number."));
        }
      else
        {
          if (name)
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "greater than %g for %s."),
                       min, name);
          else
            lex_error (lexer,
                       _("Syntax error expecting number "
                         "greater than %g."),
                       min);
        }
    }
  else if (mention_max)
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "less than %g for %s."),
                   max, name);
      else
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "less than %g."),
                   max);
    }
  else
    {
      if (name)
        lex_error (lexer,
                   _("Syntax error expecting number "
                     "for %s."),
                   name);
      else
        lex_error (lexer, _("Syntax error expecting number."));
    }

  return false;
}
1292
1293 /* If the current token is an identifier, does nothing and returns true.
1294    Otherwise, reports an error and returns false. */
1295 bool
1296 lex_force_id (struct lexer *lexer)
1297 {
1298   if (lex_token (lexer) == T_ID)
1299     return true;
1300
1301   lex_error (lexer, _("Syntax error expecting identifier."));
1302   return false;
1303 }
1304 \f
1305 /* Token accessors. */
1306
1307 /* Returns the type of LEXER's current token. */
1308 enum token_type
1309 lex_token (const struct lexer *lexer)
1310 {
1311   return lex_next_token (lexer, 0);
1312 }
1313
/* Returns the numeric value of LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  return token_number (lex_next (lexer, 0));
}
1323
1324 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1325
1326    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1327    this functions this function will always return NULL.
1328
1329    The UTF-8 encoding of the returned string is correct for variable names and
1330    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1331    data_in() to use it in a "union value".  */
1332 const char *
1333 lex_tokcstr (const struct lexer *lexer)
1334 {
1335   return lex_next_tokcstr (lexer, 0);
1336 }
1337
1338 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1339    null-terminated (but the null terminator is not included in the returned
1340    substring's 'length').
1341
1342    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1343    this functions this function will always return NULL.
1344
1345    The UTF-8 encoding of the returned string is correct for variable names and
1346    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1347    data_in() to use it in a "union value".  */
1348 struct substring
1349 lex_tokss (const struct lexer *lexer)
1350 {
1351   return lex_next_tokss (lexer, 0);
1352 }
1353 \f
1354 /* Looking ahead.
1355
1356    A value of 0 for N as an argument to any of these functions refers to the
1357    current token.  Lookahead is limited to the current command.  Any N greater
1358    than the number of tokens remaining in the current command will be treated
1359    as referring to a T_ENDCMD token. */
1360
1361 static const struct lex_token *
1362 lex_next__ (const struct lexer *lexer_, int n)
1363 {
1364   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1365   struct lex_source *src = lex_source__ (lexer);
1366
1367   if (src != NULL)
1368     return lex_source_next__ (src, n);
1369   else
1370     {
1371       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1372       return &stop_token;
1373     }
1374 }
1375
1376 static const struct lex_token *
1377 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1378 {
1379   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1380
1381   if (ofs < 0)
1382     {
1383       static const struct lex_token endcmd_token
1384         = { .token = { .type = T_ENDCMD } };
1385       return &endcmd_token;
1386     }
1387
1388   while (ofs >= src->n_parse)
1389     {
1390       if (src->n_parse > 0)
1391         {
1392           const struct lex_token *t = src->parse[src->n_parse - 1];
1393           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1394             return t;
1395         }
1396
1397       lex_source_get_parse (src);
1398     }
1399
1400   return src->parse[ofs];
1401 }
1402
1403 static const struct lex_token *
1404 lex_source_next__ (const struct lex_source *src, int n)
1405 {
1406   return lex_source_ofs__ (src, n + src->parse_ofs);
1407 }
1408
1409 /* Returns the "struct token" of the token N after the current one in LEXER.
1410    The returned pointer can be invalidated by pretty much any succeeding call
1411    into the lexer, although the string pointer within the returned token is
1412    only invalidated by consuming the token (e.g. with lex_get()). */
1413 const struct token *
1414 lex_next (const struct lexer *lexer, int n)
1415 {
1416   return &lex_next__ (lexer, n)->token;
1417 }
1418
1419 /* Returns the type of the token N after the current one in LEXER. */
1420 enum token_type
1421 lex_next_token (const struct lexer *lexer, int n)
1422 {
1423   return lex_next (lexer, n)->type;
1424 }
1425
/* Returns the numeric value of the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1435
1436 /* Returns the null-terminated string in the token N after the current one, in
1437    UTF-8 encoding.
1438
1439    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1440    this functions this function will always return NULL.
1441
1442    The UTF-8 encoding of the returned string is correct for variable names and
1443    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1444    data_in() to use it in a "union value".  */
1445 const char *
1446 lex_next_tokcstr (const struct lexer *lexer, int n)
1447 {
1448   return lex_next_tokss (lexer, n).string;
1449 }
1450
1451 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1452    The string is null-terminated (but the null terminator is not included in
1453    the returned substring's 'length').
1454
1455    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1456    tokens this functions this function will always return NULL.
1457
1458    The UTF-8 encoding of the returned string is correct for variable names and
1459    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1460    data_in() to use it in a "union value".  */
1461 struct substring
1462 lex_next_tokss (const struct lexer *lexer, int n)
1463 {
1464   return lex_next (lexer, n)->string;
1465 }
1466
1467 /* Returns the offset of the current token within the command being parsed in
1468    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1469    on.  The return value is useful later for referring to this token in calls
1470    to lex_ofs_*(). */
1471 int
1472 lex_ofs (const struct lexer *lexer)
1473 {
1474   struct lex_source *src = lex_source__ (lexer);
1475   return src ? src->parse_ofs : 0;
1476 }
1477
1478 /* Returns the offset of the last token in the current command. */
1479 int
1480 lex_max_ofs (const struct lexer *lexer)
1481 {
1482   struct lex_source *src = lex_source__ (lexer);
1483   if (!src)
1484     return 0;
1485
1486   int ofs = MAX (1, src->n_parse) - 1;
1487   for (;;)
1488     {
1489       enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1490       if (type == T_ENDCMD || type == T_STOP)
1491         return ofs;
1492
1493       ofs++;
1494     }
1495 }
1496
1497 /* Returns the token within LEXER's current command with offset OFS.  Use
1498    lex_ofs() to find out the offset of the current token. */
1499 const struct token *
1500 lex_ofs_token (const struct lexer *lexer_, int ofs)
1501 {
1502   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1503   struct lex_source *src = lex_source__ (lexer);
1504
1505   if (src != NULL)
1506     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1507   else
1508     {
1509       static const struct token stop_token = { .type = T_STOP };
1510       return &stop_token;
1511     }
1512 }
1513
/* Allocates and returns a new struct msg_location covering the tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in LEXER.
   See lex_ofs() for an explanation of token offsets.

   The caller owns the returned object and must eventually free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes distances from the current token, so rebase
     the offsets. */
  int cur = lex_ofs (lexer);
  int n0 = ofs0 - cur;
  int n1 = ofs1 - cur;
  return lex_get_location (lexer, n0, n1);
}
1525
1526 /* Returns a msg_point for the first character in the token with offset OFS,
1527    where offset 0 is the first token in the command currently being parsed, 1
1528    the second token, and so on.  These are absolute offsets, not relative to
1529    the token currently being parsed within the command.
1530
1531    Returns zeros for a T_STOP token.
1532  */
1533 struct msg_point
1534 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1535 {
1536   const struct lex_source *src = lex_source__ (lexer);
1537   return (src
1538           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1539           : (struct msg_point) { 0, 0 });
1540 }
1541
1542 /* Returns a msg_point for the last character, inclusive, in the token with
1543    offset OFS, where offset 0 is the first token in the command currently being
1544    parsed, 1 the second token, and so on.  These are absolute offsets, not
1545    relative to the token currently being parsed within the command.
1546
1547    Returns zeros for a T_STOP token.
1548
1549    Most of the time, a single token is wholly within a single line of syntax,
1550    so that the start and end point for a given offset have the same line
1551    number.  There are two exceptions: a T_STRING token can be made up of
1552    multiple segments on adjacent lines connected with "+" punctuators, and a
1553    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1554    the next.
1555  */
1556 struct msg_point
1557 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1558 {
1559   const struct lex_source *src = lex_source__ (lexer);
1560   return (src
1561           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1562           : (struct msg_point) { 0, 0 });
1563 }
1564
1565 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1566    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1567    are both zero, this requests the syntax for the current token.)
1568
1569    The caller must eventually free the returned string (with free()).  The
1570    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1571    that, for example, it may include comments, spaces, and new-lines if it
1572    spans multiple tokens.  Macro expansion, however, has already been
1573    performed. */
1574 char *
1575 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1576 {
1577   const struct lex_source *src = lex_source__ (lexer);
1578   return (src
1579           ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1580           : xstrdup (""));
1581 }
1582
1583
/* Returns the syntax text for the tokens with offsets OFS0 through OFS1,
   inclusive.  (With OFS0 and OFS1 both zero, this is the syntax for the first
   token in the current command.)  Returns an empty string if LEXER has no
   current source.

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer,
   so that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1599
1600 /* Returns true if the token N ahead of the current one was produced by macro
1601    expansion, false otherwise. */
1602 bool
1603 lex_next_is_from_macro (const struct lexer *lexer, int n)
1604 {
1605   return lex_next__ (lexer, n)->macro_rep != NULL;
1606 }
1607
1608 static bool
1609 lex_tokens_match (const struct token *actual, const struct token *expected)
1610 {
1611   if (actual->type != expected->type)
1612     return false;
1613
1614   switch (actual->type)
1615     {
1616     case T_POS_NUM:
1617     case T_NEG_NUM:
1618       return actual->number == expected->number;
1619
1620     case T_ID:
1621       return lex_id_match (expected->string, actual->string);
1622
1623     case T_STRING:
1624       return (actual->string.length == expected->string.length
1625               && !memcmp (actual->string.string, expected->string.string,
1626                           actual->string.length));
1627
1628     default:
1629       return true;
1630     }
1631 }
1632
1633 static size_t
1634 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s)
1635 {
1636   struct string_lexer slex;
1637   struct token token;
1638
1639   size_t i = 0;
1640   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1641   while (string_lexer_next (&slex, &token))
1642     {
1643       bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + i++), &token);
1644       token_uninit (&token);
1645       if (!match)
1646         return 0;
1647     }
1648   return i;
1649 }
1650
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
  return n != 0;
}
1662
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips those tokens and returns true.  Otherwise, leaves LEXER where it is
   and returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
  if (n == 0)
    return false;

  lex_get_n (lexer, n);
  return true;
}
1677
/* Returns the 1-based line number of the source text at the byte OFFSET in
   SRC.

   This is a binary search over SRC->lines[], which is compared against byte
   offsets here (presumably each element is the offset at which a line
   starts; confirm against the code that fills it in).  The result is the
   1-based number of the line whose entry is at or before OFFSET and whose
   successor's entry is after it.  The last line is returned when the search
   reaches the end of the array. */
static int
lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
{
  size_t lo = 0;
  size_t hi = src->n_lines;
  for (;;)
    {
      size_t mid = (lo + hi) / 2;
      /* There is no entry after 'mid' to compare against, so OFFSET is on the
         last line. */
      if (mid + 1 >= src->n_lines)
        return src->n_lines;
      /* OFFSET is at or past the next entry: search the upper half. */
      else if (offset >= src->lines[mid + 1])
        lo = mid;
      /* OFFSET is before this entry: search the lower half. */
      else if (offset < src->lines[mid])
        hi = mid;
      /* lines[mid] <= OFFSET < lines[mid + 1]: found it (convert the 0-based
         index to a 1-based line number). */
      else
        return mid + 1;
    }
}
1698
1699 /* Returns the 1-based column number of the source text at the byte OFFSET in
1700    SRC. */
1701 static int
1702 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1703 {
1704   const char *newline = memrchr (src->buffer, '\n', offset);
1705   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1706   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1707 }
1708
1709 static struct msg_point
1710 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1711 {
1712   return (struct msg_point) {
1713     .line = lex_source_ofs_to_line_number (src, offset),
1714     .column = lex_source_ofs_to_column_number (src, offset),
1715   };
1716 }
1717
1718 static struct msg_point
1719 lex_token_start_point (const struct lex_source *src,
1720                        const struct lex_token *token)
1721 {
1722   return lex_source_ofs_to_point__ (src, token->token_pos);
1723 }
1724
1725 static struct msg_point
1726 lex_token_end_point (const struct lex_source *src,
1727                      const struct lex_token *token)
1728 {
1729   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1730 }
1731
1732 static struct msg_location
1733 lex_token_location (const struct lex_source *src,
1734                     const struct lex_token *t0,
1735                     const struct lex_token *t1)
1736 {
1737   return (struct msg_location) {
1738     .file_name = intern_new_if_nonnull (src->reader->file_name),
1739     .start = lex_token_start_point (src, t0),
1740     .end = lex_token_end_point (src, t1),
1741     .src = CONST_CAST (struct lex_source *, src),
1742   };
1743 }
1744
1745 static struct msg_location *
1746 lex_token_location_rw (const struct lex_source *src,
1747                        const struct lex_token *t0,
1748                        const struct lex_token *t1)
1749 {
1750   struct msg_location location = lex_token_location (src, t0, t1);
1751   return msg_location_dup (&location);
1752 }
1753
/* Returns a heap-allocated msg_location covering the tokens with offsets OFS0
   through OFS1 within the command being parsed from SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1761
1762 /* Returns the name of the syntax file from which the current command is drawn.
1763    Returns NULL for a T_STOP token or if the command's source does not have
1764    line numbers.
1765
1766    There is no version of this function that takes an N argument because
1767    lookahead only works to the end of a command and any given command is always
1768    within a single syntax file. */
1769 const char *
1770 lex_get_file_name (const struct lexer *lexer)
1771 {
1772   struct lex_source *src = lex_source__ (lexer);
1773   return src == NULL ? NULL : src->reader->file_name;
1774 }
1775
1776 /* Returns a newly allocated msg_location for the syntax that represents tokens
1777    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1778    must eventually free the location (with msg_location_destroy()). */
1779 struct msg_location *
1780 lex_get_location (const struct lexer *lexer, int n0, int n1)
1781 {
1782   struct msg_location *loc = xmalloc (sizeof *loc);
1783   *loc = (struct msg_location) {
1784     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1785     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1786     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1787     .src = lex_source__ (lexer),
1788   };
1789   lex_source_ref (loc->src);
1790   return loc;
1791 }
1792
1793 const char *
1794 lex_get_encoding (const struct lexer *lexer)
1795 {
1796   struct lex_source *src = lex_source__ (lexer);
1797   return src == NULL ? NULL : src->reader->encoding;
1798 }
1799
1800 /* Returns the syntax mode for the syntax file from which the current drawn is
1801    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1802    does not have line numbers.
1803
1804    There is no version of this function that takes an N argument because
1805    lookahead only works to the end of a command and any given command is always
1806    within a single syntax file. */
1807 enum segmenter_mode
1808 lex_get_syntax_mode (const struct lexer *lexer)
1809 {
1810   struct lex_source *src = lex_source__ (lexer);
1811   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1812 }
1813
1814 /* Returns the error mode for the syntax file from which the current drawn is
1815    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1816    source does not have line numbers.
1817
1818    There is no version of this function that takes an N argument because
1819    lookahead only works to the end of a command and any given command is always
1820    within a single syntax file. */
1821 enum lex_error_mode
1822 lex_get_error_mode (const struct lexer *lexer)
1823 {
1824   struct lex_source *src = lex_source__ (lexer);
1825   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1826 }
1827
1828 /* If the source that LEXER is currently reading has error mode
1829    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1830    token to be read comes directly from whatever is next read from the stream.
1831
1832    It makes sense to call this function after encountering an error in a
1833    command entered on the console, because usually the user would prefer not to
1834    have cascading errors. */
1835 void
1836 lex_interactive_reset (struct lexer *lexer)
1837 {
1838   struct lex_source *src = lex_source__ (lexer);
1839   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1840     {
1841       src->length = 0;
1842       src->journal_pos = src->seg_pos = 0;
1843       src->n_lines = 0;
1844       src->suppress_next_newline = false;
1845       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1846                                        false);
1847       lex_stage_clear (&src->pp);
1848       lex_stage_clear (&src->merge);
1849       lex_source_clear_parse (src);
1850       lex_source_push_endcmd__ (src);
1851     }
1852 }
1853
1854 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1855 void
1856 lex_discard_rest_of_command (struct lexer *lexer)
1857 {
1858   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1859     lex_get (lexer);
1860 }
1861
1862 /* Discards all lookahead tokens in LEXER, then discards all input sources
1863    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1864    runs out of input sources. */
1865 void
1866 lex_discard_noninteractive (struct lexer *lexer)
1867 {
1868   struct lex_source *src = lex_source__ (lexer);
1869
1870   if (src != NULL)
1871     {
1872       lex_stage_clear (&src->pp);
1873       lex_stage_clear (&src->merge);
1874       lex_source_clear_parse (src);
1875
1876       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1877            src = lex_source__ (lexer))
1878         {
1879           ll_remove (&src->ll);
1880           lex_source_unref (src);
1881         }
1882     }
1883 }
1884 \f
1885 static void
1886 lex_source_expand__ (struct lex_source *src)
1887 {
1888   if (src->length >= src->allocated)
1889     src->buffer = x2realloc (src->buffer, &src->allocated);
1890 }
1891
1892 static void
1893 lex_source_read__ (struct lex_source *src)
1894 {
1895   do
1896     {
1897       lex_source_expand__ (src);
1898
1899       size_t space = src->allocated - src->length;
1900       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1901       size_t n = src->reader->class->read (src->reader,
1902                                            &src->buffer[src->length],
1903                                            space, prompt);
1904       assert (n <= space);
1905
1906       if (n == 0)
1907         {
1908           /* End of input. */
1909           src->reader->eof = true;
1910           return;
1911         }
1912
1913       src->length += n;
1914     }
1915   while (!memchr (&src->buffer[src->seg_pos], '\n',
1916                   src->length - src->seg_pos));
1917 }
1918
1919 static struct lex_source *
1920 lex_source__ (const struct lexer *lexer)
1921 {
1922   return (ll_is_empty (&lexer->sources) ? NULL
1923           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1924 }
1925
/* Public, read-only counterpart of lex_source__(): returns LEXER's current
   source, or NULL if it has none. */
const struct lex_source *
lex_source (const struct lexer *lexer)
{
  const struct lex_source *current = lex_source__ (lexer);
  return current;
}
1931
1932 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1933    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
1934    both zero, this requests the syntax for the first token in the current
1935    command.)  The caller must eventually free the returned string (with
1936    free()).  The syntax is encoded in UTF-8 and in the original form supplied
1937    to the lexer so that, for example, it may include comments, spaces, and
1938    new-lines if it spans multiple tokens.  Macro expansion, however, has
1939    already been performed. */
1940 static char *
1941 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1942 {
1943   struct string s = DS_EMPTY_INITIALIZER;
1944   for (size_t i = ofs0; i <= ofs1; )
1945     {
1946       /* Find [I,J) as the longest sequence of tokens not produced by macro
1947          expansion, or otherwise the longest sequence expanded from a single
1948          macro call. */
1949       const struct lex_token *first = lex_source_ofs__ (src, i);
1950       size_t j;
1951       for (j = i + 1; j <= ofs1; j++)
1952         {
1953           const struct lex_token *cur = lex_source_ofs__ (src, j);
1954           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1955               || first->macro_rep != cur->macro_rep)
1956             break;
1957         }
1958       const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1959
1960       /* Now add the syntax for this sequence of tokens to SRC. */
1961       if (!ds_is_empty (&s))
1962         ds_put_byte (&s, ' ');
1963       if (!first->macro_rep)
1964         {
1965           size_t start = first->token_pos;
1966           size_t end = last->token_pos + last->token_len;
1967           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1968         }
1969       else
1970         {
1971           size_t start = first->ofs;
1972           size_t end = last->ofs + last->len;
1973           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1974                                            end - start));
1975         }
1976
1977       i = j;
1978     }
1979   return ds_steal_cstr (&s);
1980 }
1981
1982 static bool
1983 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1984 {
1985   for (int i = ofs0; i <= ofs1; i++)
1986     if (lex_source_ofs__ (src, i)->macro_rep)
1987       return true;
1988   return false;
1989 }
1990
1991 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1992    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1993    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1994    the original form supplied to the lexer so that, for example, it may include
1995    comments, spaces, and new-lines if it spans multiple tokens.
1996
1997    Returns an empty string if the token range doesn't include a macro call.
1998
1999    The caller must not modify or free the returned string. */
2000 static struct substring
2001 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2002 {
2003   if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2004     return ss_empty ();
2005
2006   const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2007   const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2008   size_t start = token0->token_pos;
2009   size_t end = token1->token_pos + token1->token_len;
2010
2011   return ss_buffer (&src->buffer[start], end - start);
2012 }
2013
2014 static void
2015 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2016                        int ofs0, int ofs1, const char *format, va_list args)
2017 {
2018   struct string s = DS_EMPTY_INITIALIZER;
2019
2020   if (src)
2021     {
2022       /* Get the macro call(s) that expanded to the syntax that caused the
2023          error. */
2024       char call[64];
2025       str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2026                      call, sizeof call);
2027       if (call[0])
2028         ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2029     }
2030   else
2031     ds_put_cstr (&s, _("At end of input"));
2032
2033   if (!ds_is_empty (&s))
2034     ds_put_cstr (&s, ": ");
2035   if (format)
2036     ds_put_vformat (&s, format, args);
2037   else
2038     ds_put_cstr (&s, _("Syntax error."));
2039
2040   if (ds_last (&s) != '.')
2041     ds_put_byte (&s, '.');
2042
2043   struct msg *m = xmalloc (sizeof *m);
2044   *m = (struct msg) {
2045     .category = msg_class_to_category (class),
2046     .severity = msg_class_to_severity (class),
2047     .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2048     .text = ds_steal_cstr (&s),
2049   };
2050   msg_emit (m);
2051 }
2052
2053 static void
2054 lex_get_error (struct lex_source *src, const struct lex_token *token)
2055 {
2056   char syntax[64];
2057   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2058                  syntax, sizeof syntax);
2059
2060   struct string s = DS_EMPTY_INITIALIZER;
2061   ds_put_cstr (&s, token->token.string.string);
2062
2063   struct msg *m = xmalloc (sizeof *m);
2064   *m = (struct msg) {
2065     .category = MSG_C_SYNTAX,
2066     .severity = MSG_S_ERROR,
2067     .location = lex_token_location_rw (src, token, token),
2068     .text = ds_steal_cstr (&s),
2069   };
2070   msg_emit (m);
2071 }
2072
2073 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2074    underlying lex_reader if necessary.  Returns true if a new token was added
2075    to SRC's deque, false otherwise.  The caller should retry failures unless
2076    SRC's 'eof' marker was set to true indicating that there will be no more
2077    tokens from this source. */
2078 static bool
2079 lex_source_try_get_pp (struct lex_source *src)
2080 {
2081   /* Append a new token to SRC and initialize it. */
2082   struct lex_token *token = xmalloc (sizeof *token);
2083   token->token = (struct token) { .type = T_STOP };
2084   token->macro_rep = NULL;
2085   token->ref_cnt = NULL;
2086   token->token_pos = src->seg_pos;
2087
2088   /* Extract a segment. */
2089   const char *segment;
2090   enum segment_type seg_type;
2091   int seg_len;
2092   for (;;)
2093     {
2094       segment = &src->buffer[src->seg_pos];
2095       seg_len = segmenter_push (&src->segmenter, segment,
2096                                 src->length - src->seg_pos,
2097                                 src->reader->eof, &seg_type);
2098       if (seg_len >= 0)
2099         break;
2100
2101       /* The segmenter needs more input to produce a segment. */
2102       assert (!src->reader->eof);
2103       lex_source_read__ (src);
2104     }
2105
2106   /* Update state based on the segment. */
2107   token->token_len = seg_len;
2108   src->seg_pos += seg_len;
2109   if (seg_type == SEG_NEWLINE)
2110     {
2111       if (src->n_lines >= src->allocated_lines)
2112         src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2113                                  sizeof *src->lines);
2114       src->lines[src->n_lines++] = src->seg_pos;
2115     }
2116
2117   /* Get a token from the segment. */
2118   enum tokenize_result result = token_from_segment (
2119     seg_type, ss_buffer (segment, seg_len), &token->token);
2120
2121   /* If we've reached the end of a line, or the end of a command, then pass
2122      the line to the output engine as a syntax text item.  */
2123   int n_lines = seg_type == SEG_NEWLINE;
2124   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2125     {
2126       n_lines++;
2127       src->suppress_next_newline = true;
2128     }
2129   else if (n_lines > 0 && src->suppress_next_newline)
2130     {
2131       n_lines--;
2132       src->suppress_next_newline = false;
2133     }
2134   for (int i = 0; i < n_lines; i++)
2135     {
2136       /* Beginning of line. */
2137       const char *line = &src->buffer[src->journal_pos];
2138
2139       /* Calculate line length, including \n or \r\n end-of-line if present.
2140
2141          We use src->length even though that may be beyond what we've actually
2142          converted to tokens.  That's because, if we're emitting the line due
2143          to SEG_END_COMMAND, we want to take the whole line through the
2144          newline, not just through the '.'. */
2145       size_t max_len = src->length - src->journal_pos;
2146       const char *newline = memchr (line, '\n', max_len);
2147       size_t line_len = newline ? newline - line + 1 : max_len;
2148
2149       /* Calculate line length excluding end-of-line. */
2150       size_t copy_len = line_len;
2151       if (copy_len > 0 && line[copy_len - 1] == '\n')
2152         copy_len--;
2153       if (copy_len > 0 && line[copy_len - 1] == '\r')
2154         copy_len--;
2155
2156       /* Submit the line as syntax. */
2157       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2158                                                    xmemdup0 (line, copy_len),
2159                                                    NULL));
2160
2161       src->journal_pos += line_len;
2162     }
2163
2164   switch (result)
2165     {
2166     case TOKENIZE_ERROR:
2167       lex_get_error (src, token);
2168       /* Fall through. */
2169     case TOKENIZE_EMPTY:
2170       lex_token_destroy (token);
2171       return false;
2172
2173     case TOKENIZE_TOKEN:
2174       if (token->token.type == T_STOP)
2175         {
2176           token->token.type = T_ENDCMD;
2177           src->eof = true;
2178         }
2179       lex_stage_push_last (&src->pp, token);
2180       return true;
2181     }
2182   NOT_REACHED ();
2183 }
2184
2185 /* Attempts to append a new token to SRC.  Returns true if successful, false on
2186    failure.  On failure, the end of SRC has been reached and no more tokens
2187    will be forthcoming from it.
2188
2189    Does not make the new token available for lookahead yet; the caller must
2190    adjust SRC's 'middle' pointer to do so. */
2191 static bool
2192 lex_source_get_pp (struct lex_source *src)
2193 {
2194   while (!src->eof)
2195     if (lex_source_try_get_pp (src))
2196       return true;
2197   return false;
2198 }
2199
/* Attempts to move at least one token from the 'pp' stage of SRC into its
   'merge' stage, expanding a macro call at the front of 'pp' if there is one.

   Returns true if at least one token was appended to 'merge'.  Returns false
   if no token could be obtained for 'pp', or if the front of 'pp' was a macro
   call whose expansion was empty; in the latter case the call's tokens have
   still been consumed, so the caller may simply try again.

   SRC_ is declared const but is modified through a cast. */
static bool
lex_source_try_get_merge (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);

  /* Make sure that 'pp' holds at least one token. */
  if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
    return false;

  /* With macro expansion disabled, every token passes straight through. */
  if (!settings_get_mexpand ())
    {
      lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
      return true;
    }

  /* Now pass tokens one-by-one to the macro expander.

     In the common case where there is no macro to expand, the loop is not
     entered.

     As used here, N_CALL is 0 while the expander wants to see more tokens,
     negative once it is known that there is no macro call, and otherwise the
     number of tokens at the front of 'pp' that make up the call. */
  struct macro_call *mc;
  int n_call = macro_call_create (src->lexer->macros,
                                  &lex_stage_first (&src->pp)->token, &mc);
  for (int ofs = 1; !n_call; ofs++)
    {
      if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
        {
          /* This should not be reachable because we always get a T_ENDCMD at
             the end of an input file (transformed from T_STOP by
             lex_source_try_get_pp()) and the macro_expander should always
             terminate expansion on T_ENDCMD. */
          NOT_REACHED ();
        }

      /* Feed the expander the token along with its raw syntax and location. */
      const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
      const struct macro_token mt = {
        .token = t->token,
        .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
      };
      const struct msg_location loc = lex_token_location (src, t, t);
      n_call = macro_call_add (mc, &mt, &loc);
    }
  if (n_call < 0)
    {
      /* False alarm: no macro expansion after all.  Use first token as
         lookahead.  We'll retry macro expansion from the second token next
         time around. */
      macro_call_destroy (mc);
      lex_stage_shift (&src->merge, &src->pp, 1);
      return true;
    }

  /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
     are a macro call.  (These are likely to be the only tokens in 'pp'.)
     Expand them.  */
  const struct lex_token *c0 = lex_stage_first (&src->pp);
  const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
  struct macro_tokens expansion = { .n = 0 };
  struct msg_location loc = lex_token_location (src, c0, c1);
  macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
  macro_call_destroy (mc);

  /* Convert the macro expansion into syntax for possible error messages
     later.  OFS[i] and LEN[i] are filled in by macro_tokens_to_syntax() for
     token i; they are stored below as each token's 'ofs' and 'len', alongside
     the syntax string itself as 'macro_rep'. */
  size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
  size_t *len = xnmalloc (expansion.n, sizeof *len);
  struct string s = DS_EMPTY_INITIALIZER;
  macro_tokens_to_syntax (&expansion, &s, ofs, len);

  if (settings_get_mprint ())
    output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
                                          _("Macro Expansion")));

  /* Append the macro expansion tokens to the lookahead. */
  if (expansion.n > 0)
    {
      /* All of the expansion's tokens share one copy of the syntax string
         and one counter, which starts out as the number of tokens. */
      char *macro_rep = ds_steal_cstr (&s);
      size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
      *ref_cnt = expansion.n;
      for (size_t i = 0; i < expansion.n; i++)
        {
          /* Each expanded token reports the whole macro call (C0...C1) as
             its position in the source text. */
          struct lex_token *token = xmalloc (sizeof *token);
          *token = (struct lex_token) {
            .token = expansion.mts[i].token,
            .token_pos = c0->token_pos,
            .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
            .macro_rep = macro_rep,
            .ofs = ofs[i],
            .len = len[i],
            .ref_cnt = ref_cnt,
          };
          lex_stage_push_last (&src->merge, token);

          /* The token itself was moved into the new lex_token above, so only
             the per-token syntax is released here. */
          ss_dealloc (&expansion.mts[i].syntax);
        }
    }
  else
    ds_destroy (&s);
  free (expansion.mts);
  free (ofs);
  free (len);

  /* Destroy the tokens for the call.  This must come last because C0 and C1,
     used above, point to tokens in 'pp'. */
  for (size_t i = 0; i < n_call; i++)
    lex_stage_pop_first (&src->pp);

  return expansion.n > 0;
}
2306
2307 /* Attempts to obtain at least one new token into 'merge' in SRC.
2308
2309    Returns true if successful, false on failure.  In the latter case, SRC is
2310    exhausted and 'src->eof' is now true. */
2311 static bool
2312 lex_source_get_merge (struct lex_source *src)
2313 {
2314   while (!src->eof)
2315     if (lex_source_try_get_merge (src))
2316       return true;
2317   return false;
2318 }
2319
2320 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2321
2322    Returns true if successful, false on failure.  In the latter case, SRC is
2323    exhausted and 'src->eof' is now true. */
2324 static bool
2325 lex_source_get_parse (struct lex_source *src)
2326 {
2327   struct merger m = MERGER_INIT;
2328   struct token out;
2329   for (size_t i = 0; ; i++)
2330     {
2331       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2332         {
2333           /* We always get a T_ENDCMD at the end of an input file
2334              (transformed from T_STOP by lex_source_try_get_pp()) and
2335              merger_add() should never return -1 on T_ENDCMD. */
2336           assert (lex_stage_is_empty (&src->merge));
2337           return false;
2338         }
2339
2340       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2341                                &out);
2342       if (!retval)
2343         {
2344           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2345           return true;
2346         }
2347       else if (retval > 0)
2348         {
2349           /* Add a token that merges all the tokens together. */
2350           const struct lex_token *first = lex_stage_first (&src->merge);
2351           const struct lex_token *last = lex_stage_nth (&src->merge,
2352                                                         retval - 1);
2353           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2354           struct lex_token *t = xmalloc (sizeof *t);
2355           *t = (struct lex_token) {
2356             .token = out,
2357             .token_pos = first->token_pos,
2358             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2359
2360             /* This works well if all the tokens were not expanded from macros,
2361                or if they came from the same macro expansion.  It just gives up
2362                in the other (corner) cases. */
2363             .macro_rep = macro ? first->macro_rep : NULL,
2364             .ofs = macro ? first->ofs : 0,
2365             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2366             .ref_cnt = macro ? first->ref_cnt : NULL,
2367           };
2368           if (t->ref_cnt)
2369             ++*t->ref_cnt;
2370           lex_source_push_parse (src, t);
2371
2372           for (int i = 0; i < retval; i++)
2373             lex_stage_pop_first (&src->merge);
2374           return true;
2375         }
2376     }
2377 }
2378 \f
2379 static void
2380 lex_source_push_endcmd__ (struct lex_source *src)
2381 {
2382   assert (src->n_parse == 0);
2383
2384   struct lex_token *token = xmalloc (sizeof *token);
2385   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2386   lex_source_push_parse (src, token);
2387 }
2388
2389 static void
2390 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2391 {
2392   if (src->n_parse >= src->allocated_parse)
2393     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2394                              sizeof *src->parse);
2395   src->parse[src->n_parse++] = token;
2396 }
2397
2398 static void
2399 lex_source_clear_parse (struct lex_source *src)
2400 {
2401   for (size_t i = 0; i < src->n_parse; i++)
2402     lex_token_destroy (src->parse[i]);
2403   src->n_parse = src->parse_ofs = 0;
2404 }
2405
2406 static struct lex_source *
2407 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2408 {
2409   size_t allocated_lines = 4;
2410   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2411   *lines = 0;
2412
2413   struct lex_source *src = xmalloc (sizeof *src);
2414   *src = (struct lex_source) {
2415     .n_refs = 1,
2416     .reader = reader,
2417     .segmenter = segmenter_init (reader->syntax, false),
2418     .lexer = lexer,
2419     .lines = lines,
2420     .n_lines = 1,
2421     .allocated_lines = allocated_lines,
2422   };
2423
2424   lex_source_push_endcmd__ (src);
2425
2426   return src;
2427 }
2428
2429 void
2430 lex_set_message_handler (struct lexer *lexer,
2431                          void (*output_msg) (const struct msg *,
2432                                              struct lexer *))
2433 {
2434   struct msg_handler msg_handler = {
2435     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2436     .aux = lexer,
2437     .lex_source_ref = lex_source_ref,
2438     .lex_source_unref = lex_source_unref,
2439     .lex_source_get_line = lex_source_get_line,
2440   };
2441   msg_set_handler (&msg_handler);
2442 }
2443
2444 struct lex_source *
2445 lex_source_ref (const struct lex_source *src_)
2446 {
2447   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2448   if (src)
2449     {
2450       assert (src->n_refs > 0);
2451       src->n_refs++;
2452     }
2453   return src;
2454 }
2455
2456 void
2457 lex_source_unref (struct lex_source *src)
2458 {
2459   if (!src)
2460     return;
2461
2462   assert (src->n_refs > 0);
2463   if (--src->n_refs > 0)
2464     return;
2465
2466   char *file_name = src->reader->file_name;
2467   char *encoding = src->reader->encoding;
2468   if (src->reader->class->destroy != NULL)
2469     src->reader->class->destroy (src->reader);
2470   free (file_name);
2471   free (encoding);
2472   free (src->buffer);
2473   free (src->lines);
2474   lex_stage_uninit (&src->pp);
2475   lex_stage_uninit (&src->merge);
2476   lex_source_clear_parse (src);
2477   free (src->parse);
2478   free (src);
2479 }
2480 \f
/* A lex_reader that obtains its input from a u8_istream, as set up by
   lex_reader_for_file(). */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Embedded generic reader;
                                   lex_file_reader_cast() converts a pointer
                                   to it back into the containing struct. */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
  };

/* Forward declaration; the initializer follows the functions it names. */
static struct lex_reader_class lex_file_reader_class;
2488
2489 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2490    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2491    ENCODING, which should take one of the forms accepted by
2492    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2493    mode of the new reader, respectively.
2494
2495    Returns a null pointer if FILE_NAME cannot be opened. */
2496 struct lex_reader *
2497 lex_reader_for_file (const char *file_name, const char *encoding,
2498                      enum segmenter_mode syntax,
2499                      enum lex_error_mode error)
2500 {
2501   struct lex_file_reader *r;
2502   struct u8_istream *istream;
2503
2504   istream = (!strcmp(file_name, "-")
2505              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2506              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2507   if (istream == NULL)
2508     {
2509       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2510       return NULL;
2511     }
2512
2513   r = xmalloc (sizeof *r);
2514   lex_reader_init (&r->reader, &lex_file_reader_class);
2515   r->reader.syntax = syntax;
2516   r->reader.error = error;
2517   r->reader.file_name = xstrdup (file_name);
2518   r->reader.encoding = xstrdup_if_nonnull (encoding);
2519   r->reader.line_number = 1;
2520   r->istream = istream;
2521
2522   return &r->reader;
2523 }
2524
2525 static struct lex_file_reader *
2526 lex_file_reader_cast (struct lex_reader *r)
2527 {
2528   return UP_CAST (r, struct lex_file_reader, reader);
2529 }
2530
2531 static size_t
2532 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2533                enum prompt_style prompt_style UNUSED)
2534 {
2535   struct lex_file_reader *r = lex_file_reader_cast (r_);
2536   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2537   if (n_read < 0)
2538     {
2539       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2540       return 0;
2541     }
2542   return n_read;
2543 }
2544
2545 static void
2546 lex_file_close (struct lex_reader *r_)
2547 {
2548   struct lex_file_reader *r = lex_file_reader_cast (r_);
2549
2550   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2551     {
2552       if (u8_istream_close (r->istream) != 0)
2553         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2554     }
2555   else
2556     u8_istream_free (r->istream);
2557
2558   free (r);
2559 }
2560
2561 static struct lex_reader_class lex_file_reader_class =
2562   {
2563     lex_file_read,
2564     lex_file_close
2565   };
2566 \f
/* A lex_reader that hands out the contents of an in-memory substring. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded generic reader;
                                   lex_string_reader_cast() converts a pointer
                                   to it back into the containing struct. */
    struct substring s;         /* The text; freed by lex_string_close(). */
    size_t offset;              /* Number of bytes of S already handed out by
                                   lex_string_read(). */
  };

/* Forward declaration; the initializer follows the functions it names. */
static struct lex_reader_class lex_string_reader_class;
2575
2576 /* Creates and returns a new lex_reader for the contents of S, which must be
2577    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2578    with ss_dealloc() when it is closed. */
2579 struct lex_reader *
2580 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2581 {
2582   struct lex_string_reader *r;
2583
2584   r = xmalloc (sizeof *r);
2585   lex_reader_init (&r->reader, &lex_string_reader_class);
2586   r->reader.syntax = SEG_MODE_AUTO;
2587   r->reader.encoding = xstrdup_if_nonnull (encoding);
2588   r->s = s;
2589   r->offset = 0;
2590
2591   return &r->reader;
2592 }
2593
2594 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2595    which must be encoded in ENCODING.  The caller retains ownership of S. */
2596 struct lex_reader *
2597 lex_reader_for_string (const char *s, const char *encoding)
2598 {
2599   struct substring ss;
2600   ss_alloc_substring (&ss, ss_cstr (s));
2601   return lex_reader_for_substring_nocopy (ss, encoding);
2602 }
2603
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result, which is taken to be encoded in ENCODING.  The reader owns the
   formatted text. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2618
2619 static struct lex_string_reader *
2620 lex_string_reader_cast (struct lex_reader *r)
2621 {
2622   return UP_CAST (r, struct lex_string_reader, reader);
2623 }
2624
2625 static size_t
2626 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2627                  enum prompt_style prompt_style UNUSED)
2628 {
2629   struct lex_string_reader *r = lex_string_reader_cast (r_);
2630   size_t chunk;
2631
2632   chunk = MIN (n, r->s.length - r->offset);
2633   memcpy (buf, r->s.string + r->offset, chunk);
2634   r->offset += chunk;
2635
2636   return chunk;
2637 }
2638
2639 static void
2640 lex_string_close (struct lex_reader *r_)
2641 {
2642   struct lex_string_reader *r = lex_string_reader_cast (r_);
2643
2644   ss_dealloc (&r->s);
2645   free (r);
2646 }
2647
2648 static struct lex_reader_class lex_string_reader_class =
2649   {
2650     lex_string_read,
2651     lex_string_close
2652   };
2653 \f
2654 struct substring
2655 lex_source_get_line (const struct lex_source *src, int line)
2656 {
2657   if (line < 1 || line > src->n_lines)
2658     return ss_empty ();
2659
2660   size_t ofs = src->lines[line - 1];
2661   size_t end;
2662   if (line < src->n_lines)
2663     end = src->lines[line];
2664   else
2665     {
2666       const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2667       end = newline ? newline - src->buffer : src->length;
2668     }
2669   return ss_buffer (&src->buffer[ofs], end - ofs);
2670 }