pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   In addition to the token itself, this records where the token's text can
   be found: in the lex_source's buffer and, for tokens that result from macro
   expansion, in the text of the expansion. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are freed by lex_token_destroy() when the
       count in '*ref_cnt' drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
  83 static struct msg_point lex_token_start_point (const struct lex_source *,
  84                                                const struct lex_token *);
  85 static struct msg_point lex_token_end_point (const struct lex_source *,
  86                                              const struct lex_token *);
  87
  88 /* Source offset of the last byte in TOKEN. */
  89 static size_t
  90 lex_token_end (const struct lex_token *token)
  91 {
  92   return token->token_pos + MAX (token->token_len, 1) - 1;
  93 }
  94
  95 static void
  96 lex_token_destroy (struct lex_token *t)
  97 {
  98   token_uninit (&t->token);
  99   if (t->ref_cnt)
 100     {
 101       assert (*t->ref_cnt > 0);
 102       if (!--*t->ref_cnt)
 103         {
 104           free (t->macro_rep);
 105           free (t->ref_cnt);
 106         }
 107     }
 108   free (t);
 109 }
 110 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   'deque' supplies the indexes and 'tokens' is the array that those indexes
   refer to; the lex_stage_*() functions keep the two in sync. */
struct lex_stage
  {
    struct deque deque;
    struct lex_token **tokens;
  };
 118
 119 static void lex_stage_clear (struct lex_stage *);
 120 static void lex_stage_uninit (struct lex_stage *);
 121
 122 static size_t lex_stage_count (const struct lex_stage *);
 123 static bool lex_stage_is_empty (const struct lex_stage *);
 124
 125 static struct lex_token *lex_stage_first (struct lex_stage *);
 126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 127
 128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 129 static void lex_stage_pop_first (struct lex_stage *);
 130
 131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 132                              size_t n);
 133
 134 /* Deletes all the tokens from STAGE. */
 135 static void
 136 lex_stage_clear (struct lex_stage *stage)
 137 {
 138   while (!deque_is_empty (&stage->deque))
 139     lex_stage_pop_first (stage);
 140 }
 141
 142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 143 static void
 144 lex_stage_uninit (struct lex_stage *stage)
 145 {
 146   lex_stage_clear (stage);
 147   free (stage->tokens);
 148 }
 149
 150 /* Returns true if STAGE contains no tokens, otherwise false. */
 151 static bool
 152 lex_stage_is_empty (const struct lex_stage *stage)
 153 {
 154   return deque_is_empty (&stage->deque);
 155 }
 156
 157 /* Returns the number of tokens in STAGE. */
 158 static size_t
 159 lex_stage_count (const struct lex_stage *stage)
 160 {
 161   return deque_count (&stage->deque);
 162 }
 163
 164 /* Returns the first token in STAGE, which must be nonempty.
 165    The first token is the one accessed with the least lookahead. */
 166 static struct lex_token *
 167 lex_stage_first (struct lex_stage *stage)
 168 {
 169   return lex_stage_nth (stage, 0);
 170 }
 171
 172 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 173    lookahead) is 0, the second token is 1, and so on.  There must be at least
 174    INDEX + 1 tokens in STAGE. */
 175 static struct lex_token *
 176 lex_stage_nth (struct lex_stage *stage, size_t index)
 177 {
 178   return stage->tokens[deque_back (&stage->deque, index)];
 179 }
 180
 181 /* Adds TOKEN so that it becomes the last token in STAGE. */
 182 static void
 183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 184 {
 185   if (deque_is_full (&stage->deque))
 186     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 187                                   sizeof *stage->tokens);
 188   stage->tokens[deque_push_front (&stage->deque)] = token;
 189 }
 190
 191 /* Removes and returns the first token from STAGE. */
 192 static struct lex_token *
 193 lex_stage_take_first (struct lex_stage *stage)
 194 {
 195   return stage->tokens[deque_pop_back (&stage->deque)];
 196 }
 197
 198 /* Removes the first token from STAGE and uninitializes it. */
 199 static void
 200 lex_stage_pop_first (struct lex_stage *stage)
 201 {
 202   lex_token_destroy (lex_stage_take_first (stage));
 203 }
 204
 205 /* Removes the first N tokens from SRC, appending them to DST as the last
 206    tokens. */
 207 static void
 208 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 209 {
 210   for (size_t i = 0; i < n; i++)
 211     lex_stage_push_last (dst, lex_stage_take_first (src));
 212 }
 213
 214 /* A source of tokens, corresponding to a syntax file.
 215
 216    This is conceptually a lex_reader wrapped with everything needed to convert
 217    its UTF-8 bytes into tokens. */
 218 struct lex_source
 219   {
 220     struct ll ll;               /* In lexer's list of sources. */
 221
 222     /* Reference count:
 223
 224        - One for struct lexer.
 225
 226        - One for each struct msg_location that references this source. */
 227     size_t n_refs;
 228
 229     struct lex_reader *reader;
 230     struct lexer *lexer;
 231     struct segmenter segmenter;
 232     bool eof;                   /* True if T_STOP was read from 'reader'. */
 233
 234     /* Buffer of UTF-8 bytes. */
 235     char *buffer;               /* Source file contents. */
 236     size_t length;              /* Number of bytes filled. */
 237     size_t allocated;           /* Number of bytes allocated. */
 238
 239     /* Offsets into 'buffer'. */
 240     size_t journal_pos;         /* First byte not yet output to journal. */
 241     size_t seg_pos;             /* First byte not yet scanned as token. */
 242
 243     /* Offset into 'buffer' of starts of lines. */
 244     size_t *lines;
 245     size_t n_lines, allocated_lines;
 246
 247     bool suppress_next_newline;
 248
 249     /* Tokens.
 250
 251        This is a pipeline with the following stages.  Each token eventually
 252        made available to the parser passes through of these stages.  The stages
 253        are named after the processing that happens in each one.
 254
 255        Initially, tokens come from the segmenter and scanner to 'pp':
 256
 257        - pp: Tokens that need to pass through the macro preprocessor to end up
 258          in 'merge'.
 259
 260        - merge: Tokens that need to pass through scan_merge() to end up in
 261          'parse'.
 262
 263        - parse: Tokens available to the client for parsing.
 264
 265       'pp' and 'merge' store tokens only temporarily until they pass into
 266       'parse'.  Tokens then live in 'parse' until the command is fully
 267       consumed, at which time they are freed together. */
 268     struct lex_stage pp;
 269     struct lex_stage merge;
 270     struct lex_token **parse;
 271     size_t n_parse, allocated_parse, parse_ofs;
 272   };
 273
 274 static struct lex_source *lex_source_create (struct lexer *,
 275                                              struct lex_reader *);
 276
/* Lexer.

   Holds a list of token sources plus the set of macros defined with
   lex_define_macro().  lex_include() adds a source at the head of the list
   and lex_append() adds one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;
  };
 283
 284 static struct lex_source *lex_source__ (const struct lexer *);
 285 static char *lex_source_syntax__ (const struct lex_source *,
 286                                   int ofs0, int ofs1);
 287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 288 static void lex_source_push_endcmd__ (struct lex_source *);
 289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 290 static void lex_source_clear_parse (struct lex_source *);
 291
 292 static bool lex_source_get_parse (struct lex_source *);
 293 static void lex_source_error_valist (struct lex_source *, int ofs0, int ofs1,
 294                                      const char *format, va_list)
 295    PRINTF_FORMAT (4, 0);
 296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 297                                                   int n);
 298 \f
 299 /* Initializes READER with the specified CLASS and otherwise some reasonable
 300    defaults.  The caller should fill in the others members as desired. */
 301 void
 302 lex_reader_init (struct lex_reader *reader,
 303                  const struct lex_reader_class *class)
 304 {
 305   reader->class = class;
 306   reader->syntax = SEG_MODE_AUTO;
 307   reader->error = LEX_ERROR_CONTINUE;
 308   reader->file_name = NULL;
 309   reader->encoding = NULL;
 310   reader->line_number = 0;
 311   reader->eof = false;
 312 }
 313
 314 /* Frees any file name already in READER and replaces it by a copy of
 315    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 316 void
 317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 318 {
 319   free (reader->file_name);
 320   reader->file_name = xstrdup_if_nonnull (file_name);
 321 }
 322 \f
 323 /* Creates and returns a new lexer. */
 324 struct lexer *
 325 lex_create (void)
 326 {
 327   struct lexer *lexer = xmalloc (sizeof *lexer);
 328   *lexer = (struct lexer) {
 329     .sources = LL_INITIALIZER (lexer->sources),
 330     .macros = macro_set_create (),
 331   };
 332   return lexer;
 333 }
 334
 335 /* Destroys LEXER. */
 336 void
 337 lex_destroy (struct lexer *lexer)
 338 {
 339   if (lexer != NULL)
 340     {
 341       struct lex_source *source, *next;
 342
 343       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 344         {
 345           ll_remove (&source->ll);
 346           lex_source_unref (source);
 347         }
 348       macro_set_destroy (lexer->macros);
 349       free (lexer);
 350     }
 351 }
 352
 353 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 354    same name.  Takes ownership of M. */
 355 void
 356 lex_define_macro (struct lexer *lexer, struct macro *m)
 357 {
 358   macro_set_add (lexer->macros, m);
 359 }
 360
 361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 362    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 363    token. */
 364 void
 365 lex_include (struct lexer *lexer, struct lex_reader *reader)
 366 {
 367   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 368   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 369 }
 370
 371 /* Appends READER to LEXER, so that it will be read after all other current
 372    readers have already been read. */
 373 void
 374 lex_append (struct lexer *lexer, struct lex_reader *reader)
 375 {
 376   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 377 }
 378 \f
 379 /* Advancing. */
 380
 381 /* Advances LEXER to the next token, consuming the current token. */
 382 void
 383 lex_get (struct lexer *lexer)
 384 {
 385   struct lex_source *src;
 386
 387   src = lex_source__ (lexer);
 388   if (src == NULL)
 389     return;
 390
 391   if (src->parse_ofs < src->n_parse)
 392     {
 393       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 394         lex_source_clear_parse (src);
 395       else
 396         src->parse_ofs++;
 397     }
 398
 399   while (src->parse_ofs == src->n_parse)
 400     if (!lex_source_get_parse (src))
 401       {
 402         ll_remove (&src->ll);
 403         lex_source_unref (src);
 404         src = lex_source__ (lexer);
 405         if (src == NULL)
 406           return;
 407       }
 408 }
 409
 410 /* Advances LEXER by N tokens. */
 411 void
 412 lex_get_n (struct lexer *lexer, size_t n)
 413 {
 414   while (n-- > 0)
 415     lex_get (lexer);
 416 }
 417 \f
 418 /* Issuing errors. */
 419
 420 /* Prints a syntax error message containing the current token and
 421    given message MESSAGE (if non-null). */
 422 void
 423 lex_error (struct lexer *lexer, const char *format, ...)
 424 {
 425   va_list args;
 426
 427   va_start (args, format);
 428   lex_ofs_error_valist (lexer, lex_ofs (lexer), lex_ofs (lexer), format, args);
 429   va_end (args);
 430 }
 431
 432 /* Prints a syntax error message containing the current token and
 433    given message MESSAGE (if non-null). */
 434 void
 435 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 436 {
 437   lex_ofs_error_valist (lexer, lex_ofs (lexer), lex_ofs (lexer), format, args);
 438 }
 439
 440 /* Prints a syntax error message for the span of tokens N0 through N1,
 441    inclusive, from the current token in LEXER, adding message MESSAGE (if
 442    non-null). */
 443 void
 444 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 445 {
 446   va_list args;
 447
 448   va_start (args, format);
 449   int ofs = lex_ofs (lexer);
 450   lex_ofs_error_valist (lexer, n0 + ofs, n1 + ofs, format, args);
 451   va_end (args);
 452 }
 453
 454 /* Prints a syntax error message for the span of tokens with offsets OFS0
 455    through OFS1, inclusive, within the current command in LEXER, adding message
 456    MESSAGE (if non-null). */
 457 void
 458 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
 459 {
 460   va_list args;
 461
 462   va_start (args, format);
 463   lex_ofs_error_valist (lexer, ofs0, ofs1, format, args);
 464   va_end (args);
 465 }
 466
 467 /* Prints a syntax error message saying that one of the strings provided as
 468    varargs, up to the first NULL, is expected. */
 469 void
 470 (lex_error_expecting) (struct lexer *lexer, ...)
 471 {
 472   va_list args;
 473
 474   va_start (args, lexer);
 475   lex_error_expecting_valist (lexer, args);
 476   va_end (args);
 477 }
 478
 479 /* Prints a syntax error message saying that one of the options provided in
 480    ARGS, up to the first NULL, is expected. */
 481 void
 482 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 483 {
 484   enum { MAX_OPTIONS = 9 };
 485   const char *options[MAX_OPTIONS];
 486   int n = 0;
 487   while (n < MAX_OPTIONS)
 488     {
 489       const char *option = va_arg (args, const char *);
 490       if (!option)
 491         break;
 492
 493       options[n++] = option;
 494     }
 495   lex_error_expecting_array (lexer, options, n);
 496 }
 497
 498 void
 499 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 500 {
 501   switch (n)
 502     {
 503     case 0:
 504       lex_error (lexer, NULL);
 505       break;
 506
 507     case 1:
 508       lex_error (lexer, _("expecting %s"), options[0]);
 509       break;
 510
 511     case 2:
 512       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 513       break;
 514
 515     case 3:
 516       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 517                  options[2]);
 518       break;
 519
 520     case 4:
 521       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 522                  options[0], options[1], options[2], options[3]);
 523       break;
 524
 525     case 5:
 526       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 527                  options[0], options[1], options[2], options[3], options[4]);
 528       break;
 529
 530     case 6:
 531       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 532                  options[0], options[1], options[2], options[3], options[4],
 533                  options[5]);
 534       break;
 535
 536     case 7:
 537       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 538                  options[0], options[1], options[2], options[3], options[4],
 539                  options[5], options[6]);
 540       break;
 541
 542     case 8:
 543       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 544                  options[0], options[1], options[2], options[3], options[4],
 545                  options[5], options[6], options[7]);
 546       break;
 547
 548     case 9:
 549       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, %s, or %s"),
 550                  options[0], options[1], options[2], options[3], options[4],
 551                  options[5], options[6], options[7], options[8]);
 552       break;
 553
 554     default:
 555       lex_error (lexer, NULL);
 556     }
 557 }
 558
/* Reports an error to the effect that subcommand SBC may only be specified
   once.  SBC is the subcommand's name, which is inserted into the message.

   This function does not take a lexer as an argument or use lex_error(),
   because the result would ordinarily just be redundant: "Syntax error at
   SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
   not help the user find the error. */
void
lex_sbc_only_once (const char *sbc)
{
  msg (SE, _("Subcommand %s may only be specified once."), sbc);
}
 571
/* Reports an error to the effect that subcommand SBC is missing.  SBC is the
   subcommand's name, which is inserted into the message.

   This function does not take a lexer as an argument or use lex_error(),
   because a missing subcommand can normally be detected only after the whole
   command has been parsed, and so lex_error() would always report "Syntax
   error at end of command", which does not help the user find the error. */
void
lex_sbc_missing (const char *sbc)
{
  msg (SE, _("Required subcommand %s was not specified."), sbc);
}
 583
/* Reports an error to the effect that specification SPEC may only be specified
   once within subcommand SBC.  The error is reported with lex_error(), so it
   is attributed to the current token in LEXER. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  /* The format names the specification first, then the subcommand. */
  lex_error (lexer, _("%s may only be specified once within subcommand %s"),
             spec, sbc);
}
 592
 593 /* Reports an error to the effect that specification SPEC is missing within
 594    subcommand SBC. */
 595 void
 596 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 597 {
 598   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 599              sbc, spec);
 600 }
 601
 602 /* Prints a syntax error message for the span of tokens with offsets OFS0
 603    through OFS1, inclusive, within the current command in LEXER, adding message
 604    MESSAGE (if non-null) with the given ARGS. */
 605 void
 606 lex_ofs_error_valist (struct lexer *lexer, int ofs0, int ofs1,
 607                       const char *format, va_list args)
 608 {
 609   struct lex_source *src = lex_source__ (lexer);
 610
 611   if (src != NULL)
 612     lex_source_error_valist (src, ofs0, ofs1, format, args);
 613   else
 614     {
 615       struct string s;
 616
 617       ds_init_empty (&s);
 618       ds_put_format (&s, _("Syntax error at end of input"));
 619       if (format != NULL)
 620         {
 621           ds_put_cstr (&s, ": ");
 622           ds_put_vformat (&s, format, args);
 623         }
 624       if (ds_last (&s) != '.')
 625         ds_put_byte (&s, '.');
 626       msg (SE, "%s", ds_cstr (&s));
 627       ds_destroy (&s);
 628     }
 629 }
 630
 631 /* Checks that we're at end of command.
 632    If so, returns a successful command completion code.
 633    If not, flags a syntax error and returns an error command
 634    completion code. */
 635 int
 636 lex_end_of_command (struct lexer *lexer)
 637 {
 638   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 639     {
 640       lex_error (lexer, _("expecting end of command"));
 641       return CMD_FAILURE;
 642     }
 643   else
 644     return CMD_SUCCESS;
 645 }
 646 \f
 647 /* Token testing functions. */
 648
 649 /* Returns true if the current token is a number. */
 650 bool
 651 lex_is_number (const struct lexer *lexer)
 652 {
 653   return lex_next_is_number (lexer, 0);
 654 }
 655
 656 /* Returns true if the current token is a string. */
 657 bool
 658 lex_is_string (const struct lexer *lexer)
 659 {
 660   return lex_next_is_string (lexer, 0);
 661 }
 662
 663 /* Returns the value of the current token, which must be a
 664    floating point number. */
 665 double
 666 lex_number (const struct lexer *lexer)
 667 {
 668   return lex_next_number (lexer, 0);
 669 }
 670
 671 /* Returns true iff the current token is an integer. */
 672 bool
 673 lex_is_integer (const struct lexer *lexer)
 674 {
 675   return lex_next_is_integer (lexer, 0);
 676 }
 677
 678 /* Returns the value of the current token, which must be an
 679    integer. */
 680 long
 681 lex_integer (const struct lexer *lexer)
 682 {
 683   return lex_next_integer (lexer, 0);
 684 }
 685 \f
 686 /* Token testing functions with lookahead.
 687
 688    A value of 0 for N as an argument to any of these functions refers to the
 689    current token.  Lookahead is limited to the current command.  Any N greater
 690    than the number of tokens remaining in the current command will be treated
 691    as referring to a T_ENDCMD token. */
 692
 693 /* Returns true if the token N ahead of the current token is a number. */
 694 bool
 695 lex_next_is_number (const struct lexer *lexer, int n)
 696 {
 697   return token_is_number (lex_next (lexer, n));
 698 }
 699
 700 /* Returns true if the token N ahead of the current token is a string. */
 701 bool
 702 lex_next_is_string (const struct lexer *lexer, int n)
 703 {
 704   return token_is_string (lex_next (lexer, n));
 705 }
 706
 707 /* Returns the value of the token N ahead of the current token, which must be a
 708    floating point number. */
 709 double
 710 lex_next_number (const struct lexer *lexer, int n)
 711 {
 712   return token_number (lex_next (lexer, n));
 713 }
 714
 715 /* Returns true if the token N ahead of the current token is an integer. */
 716 bool
 717 lex_next_is_integer (const struct lexer *lexer, int n)
 718 {
 719   return token_is_integer (lex_next (lexer, n));
 720 }
 721
 722 /* Returns the value of the token N ahead of the current token, which must be
 723    an integer. */
 724 long
 725 lex_next_integer (const struct lexer *lexer, int n)
 726 {
 727   return token_integer (lex_next (lexer, n));
 728 }
 729 \f
 730 /* Token matching functions. */
 731
 732 /* If the current token has the specified TYPE, skips it and returns true.
 733    Otherwise, returns false. */
 734 bool
 735 lex_match (struct lexer *lexer, enum token_type type)
 736 {
 737   if (lex_token (lexer) == type)
 738     {
 739       lex_get (lexer);
 740       return true;
 741     }
 742   else
 743     return false;
 744 }
 745
 746 /* If the current token matches IDENTIFIER, skips it and returns true.
 747    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 748    returns false.
 749
 750    IDENTIFIER must be an ASCII string. */
 751 bool
 752 lex_match_id (struct lexer *lexer, const char *identifier)
 753 {
 754   return lex_match_id_n (lexer, identifier, 3);
 755 }
 756
 757 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 758    may be abbreviated to its first N letters.  Otherwise, returns false.
 759
 760    IDENTIFIER must be an ASCII string. */
 761 bool
 762 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 763 {
 764   if (lex_token (lexer) == T_ID
 765       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 766     {
 767       lex_get (lexer);
 768       return true;
 769     }
 770   else
 771     return false;
 772 }
 773
 774 /* If the current token is integer X, skips it and returns true.  Otherwise,
 775    returns false. */
 776 bool
 777 lex_match_int (struct lexer *lexer, int x)
 778 {
 779   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 780     {
 781       lex_get (lexer);
 782       return true;
 783     }
 784   else
 785     return false;
 786 }
 787 \f
 788 /* Forced matches. */
 789
 790 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 791    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 792    false.
 793
 794    IDENTIFIER must be an ASCII string. */
 795 bool
 796 lex_force_match_id (struct lexer *lexer, const char *identifier)
 797 {
 798   if (lex_match_id (lexer, identifier))
 799     return true;
 800   else
 801     {
 802       lex_error_expecting (lexer, identifier);
 803       return false;
 804     }
 805 }
 806
 807 /* If the current token has the specified TYPE, skips it and returns true.
 808    Otherwise, reports an error and returns false. */
 809 bool
 810 lex_force_match (struct lexer *lexer, enum token_type type)
 811 {
 812   if (lex_token (lexer) == type)
 813     {
 814       lex_get (lexer);
 815       return true;
 816     }
 817   else
 818     {
 819       const char *type_string = token_type_to_string (type);
 820       if (type_string)
 821         {
 822           char *s = xasprintf ("`%s'", type_string);
 823           lex_error_expecting (lexer, s);
 824           free (s);
 825         }
 826       else
 827         lex_error_expecting (lexer, token_type_to_name (type));
 828
 829       return false;
 830     }
 831 }
 832
 833 /* If the current token is a string, does nothing and returns true.
 834    Otherwise, reports an error and returns false. */
 835 bool
 836 lex_force_string (struct lexer *lexer)
 837 {
 838   if (lex_is_string (lexer))
 839     return true;
 840   else
 841     {
 842       lex_error (lexer, _("expecting string"));
 843       return false;
 844     }
 845 }
 846
 847 /* If the current token is a string or an identifier, does nothing and returns
 848    true.  Otherwise, reports an error and returns false.
 849
 850    This is meant for use in syntactic situations where we want to encourage the
 851    user to supply a quoted string, but for compatibility we also accept
 852    identifiers.  (One example of such a situation is file names.)  Therefore,
 853    the error message issued when the current token is wrong only says that a
 854    string is expected and doesn't mention that an identifier would also be
 855    accepted. */
 856 bool
 857 lex_force_string_or_id (struct lexer *lexer)
 858 {
 859   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 860 }
 861
 862 /* If the current token is an integer, does nothing and returns true.
 863    Otherwise, reports an error and returns false. */
 864 bool
 865 lex_force_int (struct lexer *lexer)
 866 {
 867   if (lex_is_integer (lexer))
 868     return true;
 869   else
 870     {
 871       lex_error (lexer, _("expecting integer"));
 872       return false;
 873     }
 874 }
 875
 876 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 877    nothing and returns true.  Otherwise, reports an error and returns false.
 878    If NAME is nonnull, then it is used in the error message. */
 879 bool
 880 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 881 {
 882   bool is_number = lex_is_number (lexer);
 883   bool is_integer = lex_is_integer (lexer);
 884   bool too_small = (is_integer ? lex_integer (lexer) < min
 885                     : is_number ? lex_number (lexer) < min
 886                     : false);
 887   bool too_big = (is_integer ? lex_integer (lexer) > max
 888                   : is_number ? lex_number (lexer) > max
 889                   : false);
 890   if (is_integer && !too_small && !too_big)
 891     return true;
 892
 893   if (min > max)
 894     {
 895       /* Weird, maybe a bug in the caller.  Just report that we needed an
 896          integer. */
 897       if (name)
 898         lex_error (lexer, _("Integer expected for %s."), name);
 899       else
 900         lex_error (lexer, _("Integer expected."));
 901     }
 902   else if (min == max)
 903     {
 904       if (name)
 905         lex_error (lexer, _("Expected %ld for %s."), min, name);
 906       else
 907         lex_error (lexer, _("Expected %ld."), min);
 908     }
 909   else if (min + 1 == max)
 910     {
 911       if (name)
 912         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 913       else
 914         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 915     }
 916   else
 917     {
 918       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 919       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 920
 921       if (report_lower_bound && report_upper_bound)
 922         {
 923           if (name)
 924             lex_error (lexer,
 925                        _("Expected integer between %ld and %ld for %s."),
 926                        min, max, name);
 927           else
 928             lex_error (lexer, _("Expected integer between %ld and %ld."),
 929                        min, max);
 930         }
 931       else if (report_lower_bound)
 932         {
 933           if (min == 0)
 934             {
 935               if (name)
 936                 lex_error (lexer, _("Expected non-negative integer for %s."),
 937                            name);
 938               else
 939                 lex_error (lexer, _("Expected non-negative integer."));
 940             }
 941           else if (min == 1)
 942             {
 943               if (name)
 944                 lex_error (lexer, _("Expected positive integer for %s."),
 945                            name);
 946               else
 947                 lex_error (lexer, _("Expected positive integer."));
 948             }
 949           else
 950             {
 951               if (name)
 952                 lex_error (lexer, _("Expected integer %ld or greater for %s."),
 953                            min, name);
 954               else
 955                 lex_error (lexer, _("Expected integer %ld or greater."), min);
 956             }
 957         }
 958       else if (report_upper_bound)
 959         {
 960           if (name)
 961             lex_error (lexer,
 962                        _("Expected integer less than or equal to %ld for %s."),
 963                        max, name);
 964           else
 965             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 966                        max);
 967         }
 968       else
 969         {
 970           if (name)
 971             lex_error (lexer, _("Integer expected for %s."), name);
 972           else
 973             lex_error (lexer, _("Integer expected."));
 974         }
 975     }
 976   return false;
 977 }
 978
 979 /* If the current token is a number, does nothing and returns true.
 980    Otherwise, reports an error and returns false. */
 981 bool
 982 lex_force_num (struct lexer *lexer)
 983 {
 984   if (lex_is_number (lexer))
 985     return true;
 986
 987   lex_error (lexer, _("expecting number"));
 988   return false;
 989 }
 990
 991 /* If the current token is an number in the closed range [MIN,MAX], does
 992    nothing and returns true.  Otherwise, reports an error and returns false.
 993    If NAME is nonnull, then it is used in the error message. */
 994 bool
 995 lex_force_num_range_closed (struct lexer *lexer, const char *name,
 996                             double min, double max)
 997 {
 998   bool is_number = lex_is_number (lexer);
 999   bool too_small = is_number && lex_number (lexer) < min;
1000   bool too_big = is_number && lex_number (lexer) > max;
1001   if (is_number && !too_small && !too_big)
1002     return true;
1003
1004   if (min > max)
1005     {
1006       /* Weird, maybe a bug in the caller.  Just report that we needed an
1007          number. */
1008       if (name)
1009         lex_error (lexer, _("Number expected for %s."), name);
1010       else
1011         lex_error (lexer, _("Number expected."));
1012     }
1013   else if (min == max)
1014     {
1015       if (name)
1016         lex_error (lexer, _("Expected %g for %s."), min, name);
1017       else
1018         lex_error (lexer, _("Expected %g."), min);
1019     }
1020   else
1021     {
1022       bool report_lower_bound = min > -DBL_MAX || too_small;
1023       bool report_upper_bound = max < DBL_MAX || too_big;
1024
1025       if (report_lower_bound && report_upper_bound)
1026         {
1027           if (name)
1028             lex_error (lexer,
1029                        _("Expected number between %g and %g for %s."),
1030                        min, max, name);
1031           else
1032             lex_error (lexer, _("Expected number between %g and %g."),
1033                        min, max);
1034         }
1035       else if (report_lower_bound)
1036         {
1037           if (min == 0)
1038             {
1039               if (name)
1040                 lex_error (lexer, _("Expected non-negative number for %s."),
1041                            name);
1042               else
1043                 lex_error (lexer, _("Expected non-negative number."));
1044             }
1045           else
1046             {
1047               if (name)
1048                 lex_error (lexer, _("Expected number %g or greater for %s."),
1049                            min, name);
1050               else
1051                 lex_error (lexer, _("Expected number %g or greater."), min);
1052             }
1053         }
1054       else if (report_upper_bound)
1055         {
1056           if (name)
1057             lex_error (lexer,
1058                        _("Expected number less than or equal to %g for %s."),
1059                        max, name);
1060           else
1061             lex_error (lexer, _("Expected number less than or equal to %g."),
1062                        max);
1063         }
1064       else
1065         {
1066           if (name)
1067             lex_error (lexer, _("Number expected for %s."), name);
1068           else
1069             lex_error (lexer, _("Number expected."));
1070         }
1071     }
1072   return false;
1073 }
1074
/* Requires LEXER's current token to be a number in the half-open range
   [MIN,MAX).  Returns true if it is.  Otherwise, reports an error, which
   mentions NAME if NAME is nonnull, and returns false. */
bool
lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
                              double min, double max)
{
  /* The token's value is only examined if the token is a number. */
  bool below = false;
  bool above = false;
  if (lex_is_number (lexer))
    {
      below = lex_number (lexer) < min;
      above = lex_number (lexer) >= max;
      if (!below && !above)
        return true;
    }

  if (min >= max)
    {
      /* An empty range probably indicates a bug in the caller.  Settle for
         saying that a number was needed. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
      return false;
    }

  /* A bound is mentioned if it is tighter than the extreme double value or
     if the token actually violated it. */
  bool show_min = below || min > -DBL_MAX;
  bool show_max = above || max < DBL_MAX;

  if (show_min && show_max)
    {
      if (name)
        lex_error (lexer, _("Expected number in [%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Expected number in [%g,%g)."),
                   min, max);
    }
  else if (show_min && min == 0)
    {
      if (name)
        lex_error (lexer, _("Expected non-negative number for %s."),
                   name);
      else
        lex_error (lexer, _("Expected non-negative number."));
    }
  else if (show_min)
    {
      if (name)
        lex_error (lexer, _("Expected number %g or greater for %s."),
                   min, name);
      else
        lex_error (lexer, _("Expected number %g or greater."), min);
    }
  else if (show_max)
    {
      if (name)
        lex_error (lexer,
                   _("Expected number less than %g for %s."), max, name);
      else
        lex_error (lexer, _("Expected number less than %g."), max);
    }
  else
    {
      /* Neither bound is worth mentioning. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
    }
  return false;
}
1148
/* Requires LEXER's current token to be a number in the open range (MIN,MAX),
   that is, strictly greater than MIN and strictly less than MAX.  Returns
   true if it is.  Otherwise, reports an error, which mentions NAME if NAME is
   nonnull, and returns false. */
bool
lex_force_num_range_open (struct lexer *lexer, const char *name,
                          double min, double max)
{
  /* The token's value is only examined if the token is a number. */
  bool below = false;
  bool above = false;
  if (lex_is_number (lexer))
    {
      below = lex_number (lexer) <= min;
      above = lex_number (lexer) >= max;
      if (!below && !above)
        return true;
    }

  if (min >= max)
    {
      /* An empty range probably indicates a bug in the caller.  Settle for
         saying that a number was needed. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
      return false;
    }

  /* A bound is mentioned if it is tighter than the extreme double value or
     if the token actually violated it. */
  bool show_min = below || min > -DBL_MAX;
  bool show_max = above || max < DBL_MAX;

  if (show_min && show_max)
    {
      if (name)
        lex_error (lexer, _("Expected number in (%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Expected number in (%g,%g)."), min, max);
    }
  else if (show_min && min == 0)
    {
      if (name)
        lex_error (lexer, _("Expected positive number for %s."), name);
      else
        lex_error (lexer, _("Expected positive number."));
    }
  else if (show_min)
    {
      if (name)
        lex_error (lexer, _("Expected number greater than %g for %s."),
                   min, name);
      else
        lex_error (lexer, _("Expected number greater than %g."), min);
    }
  else if (show_max)
    {
      if (name)
        lex_error (lexer, _("Expected number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer, _("Expected number less than %g."), max);
    }
  else
    {
      /* Neither bound is worth mentioning. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
    }
  return false;
}
1220
1221 /* If the current token is an identifier, does nothing and returns true.
1222    Otherwise, reports an error and returns false. */
1223 bool
1224 lex_force_id (struct lexer *lexer)
1225 {
1226   if (lex_token (lexer) == T_ID)
1227     return true;
1228
1229   lex_error (lexer, _("expecting identifier"));
1230   return false;
1231 }
1232 \f
1233 /* Token accessors. */
1234
1235 /* Returns the type of LEXER's current token. */
1236 enum token_type
1237 lex_token (const struct lexer *lexer)
1238 {
1239   return lex_next_token (lexer, 0);
1240 }
1241
/* Returns the numeric value of LEXER's current token.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token type, the result is always zero. */
double
lex_tokval (const struct lexer *lexer)
{
  double value = lex_next_tokval (lexer, 0);
  return value;
}
1251
/* Returns the string in LEXER's current token as a null-terminated, UTF-8
   encoded C string.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 form is suitable as-is for variable names and other identifiers.
   Pass it through filename_to_utf8() before using it as a filename, or
   through data_in() before storing it in a "union value".  */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const char *s = lex_next_tokcstr (lexer, 0);
  return s;
}
1265
1266 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1267    null-terminated (but the null terminator is not included in the returned
1268    substring's 'length').
1269
1270    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1271    this functions this function will always return NULL.
1272
1273    The UTF-8 encoding of the returned string is correct for variable names and
1274    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1275    data_in() to use it in a "union value".  */
1276 struct substring
1277 lex_tokss (const struct lexer *lexer)
1278 {
1279   return lex_next_tokss (lexer, 0);
1280 }
1281 \f
1282 /* Looking ahead.
1283
1284    A value of 0 for N as an argument to any of these functions refers to the
1285    current token.  Lookahead is limited to the current command.  Any N greater
1286    than the number of tokens remaining in the current command will be treated
1287    as referring to a T_ENDCMD token. */
1288
1289 static const struct lex_token *
1290 lex_next__ (const struct lexer *lexer_, int n)
1291 {
1292   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1293   struct lex_source *src = lex_source__ (lexer);
1294
1295   if (src != NULL)
1296     return lex_source_next__ (src, n);
1297   else
1298     {
1299       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1300       return &stop_token;
1301     }
1302 }
1303
1304 static const struct lex_token *
1305 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1306 {
1307   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1308
1309   if (ofs < 0)
1310     {
1311       static const struct lex_token endcmd_token
1312         = { .token = { .type = T_ENDCMD } };
1313       return &endcmd_token;
1314     }
1315
1316   while (ofs >= src->n_parse)
1317     {
1318       if (src->n_parse > 0)
1319         {
1320           const struct lex_token *t = src->parse[src->n_parse - 1];
1321           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1322             return t;
1323         }
1324
1325       lex_source_get_parse (src);
1326     }
1327
1328   return src->parse[ofs];
1329 }
1330
1331 static const struct lex_token *
1332 lex_source_next__ (const struct lex_source *src, int n)
1333 {
1334   return lex_source_ofs__ (src, n + src->parse_ofs);
1335 }
1336
1337 /* Returns the "struct token" of the token N after the current one in LEXER.
1338    The returned pointer can be invalidated by pretty much any succeeding call
1339    into the lexer, although the string pointer within the returned token is
1340    only invalidated by consuming the token (e.g. with lex_get()). */
1341 const struct token *
1342 lex_next (const struct lexer *lexer, int n)
1343 {
1344   return &lex_next__ (lexer, n)->token;
1345 }
1346
1347 /* Returns the type of the token N after the current one in LEXER. */
1348 enum token_type
1349 lex_next_token (const struct lexer *lexer, int n)
1350 {
1351   return lex_next (lexer, n)->type;
1352 }
1353
/* Returns the numeric value of the token N tokens after the current one in
   LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token type, the result is always zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1363
1364 /* Returns the null-terminated string in the token N after the current one, in
1365    UTF-8 encoding.
1366
1367    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1368    this functions this function will always return NULL.
1369
1370    The UTF-8 encoding of the returned string is correct for variable names and
1371    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1372    data_in() to use it in a "union value".  */
1373 const char *
1374 lex_next_tokcstr (const struct lexer *lexer, int n)
1375 {
1376   return lex_next_tokss (lexer, n).string;
1377 }
1378
1379 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1380    The string is null-terminated (but the null terminator is not included in
1381    the returned substring's 'length').
1382
1383    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1384    tokens this functions this function will always return NULL.
1385
1386    The UTF-8 encoding of the returned string is correct for variable names and
1387    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1388    data_in() to use it in a "union value".  */
1389 struct substring
1390 lex_next_tokss (const struct lexer *lexer, int n)
1391 {
1392   return lex_next (lexer, n)->string;
1393 }
1394
1395 /* Returns the offset of the current token within the command being parsed in
1396    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1397    on.  The return value is useful later for referring to this token in calls
1398    to lex_ofs_*(). */
1399 int
1400 lex_ofs (const struct lexer *lexer)
1401 {
1402   struct lex_source *src = lex_source__ (lexer);
1403   return src ? src->parse_ofs : 0;
1404 }
1405
1406 /* Returns the token within LEXER's current command with offset OFS.  Use
1407    lex_ofs() to find out the offset of the current token. */
1408 const struct token *
1409 lex_ofs_token (const struct lexer *lexer_, int ofs)
1410 {
1411   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1412   struct lex_source *src = lex_source__ (lexer);
1413
1414   if (src != NULL)
1415     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1416   else
1417     {
1418       static const struct token stop_token = { .type = T_STOP };
1419       return &stop_token;
1420     }
1421 }
1422
/* Creates and returns a new struct msg_location covering the tokens with
   offsets OFS0 through OFS1, inclusive, in LEXER's current command.  Token
   offsets are explained at lex_ofs().

   The returned object belongs to the caller, who must eventually free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes distances from the current token, so convert
     the absolute offsets. */
  int cur = lex_ofs (lexer);
  int n0 = ofs0 - cur;
  int n1 = ofs1 - cur;
  return lex_get_location (lexer, n0, n1);
}
1434
1435 /* Returns a msg_point for the first character in the token with offset OFS,
1436    where offset 0 is the first token in the command currently being parsed, 1
1437    the second token, and so on.  These are absolute offsets, not relative to
1438    the token currently being parsed within the command.
1439
1440    Returns zeros for a T_STOP token.
1441  */
1442 struct msg_point
1443 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1444 {
1445   const struct lex_source *src = lex_source__ (lexer);
1446   return (src
1447           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1448           : (struct msg_point) { 0, 0 });
1449 }
1450
1451 /* Returns a msg_point for the last character, inclusive, in the token with
1452    offset OFS, where offset 0 is the first token in the command currently being
1453    parsed, 1 the second token, and so on.  These are absolute offsets, not
1454    relative to the token currently being parsed within the command.
1455
1456    Returns zeros for a T_STOP token.
1457
1458    Most of the time, a single token is wholly within a single line of syntax,
1459    so that the start and end point for a given offset have the same line
1460    number.  There are two exceptions: a T_STRING token can be made up of
1461    multiple segments on adjacent lines connected with "+" punctuators, and a
1462    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1463    the next.
1464  */
1465 struct msg_point
1466 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1467 {
1468   const struct lex_source *src = lex_source__ (lexer);
1469   return (src
1470           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1471           : (struct msg_point) { 0, 0 });
1472 }
1473
1474 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1475    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1476    are both zero, this requests the syntax for the current token.)
1477
1478    The caller must eventually free the returned string (with free()).  The
1479    syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1480    that, for example, it may include comments, spaces, and new-lines if it
1481    spans multiple tokens.  Macro expansion, however, has already been
1482    performed. */
1483 char *
1484 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1485 {
1486   const struct lex_source *src = lex_source__ (lexer);
1487   return (src
1488           ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1489           : xstrdup (""));
1490 }
1491
1492
/* Returns the syntax text for the tokens with offsets OFS0 through OFS1,
   inclusive.  (With OFS0 and OFS1 both zero, this is the syntax of the first
   token in the current command.)  Returns an empty string if LEXER has no
   source.

   The caller owns the returned string and must eventually free() it.  The
   text is UTF-8 and is in the original form supplied to the lexer, so if it
   spans multiple tokens it may include comments, spaces, and new-lines.
   Macro expansion, however, has already been performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1508
1509 /* Returns true if the token N ahead of the current one was produced by macro
1510    expansion, false otherwise. */
1511 bool
1512 lex_next_is_from_macro (const struct lexer *lexer, int n)
1513 {
1514   return lex_next__ (lexer, n)->macro_rep != NULL;
1515 }
1516
1517 static bool
1518 lex_tokens_match (const struct token *actual, const struct token *expected)
1519 {
1520   if (actual->type != expected->type)
1521     return false;
1522
1523   switch (actual->type)
1524     {
1525     case T_POS_NUM:
1526     case T_NEG_NUM:
1527       return actual->number == expected->number;
1528
1529     case T_ID:
1530       return lex_id_match (expected->string, actual->string);
1531
1532     case T_STRING:
1533       return (actual->string.length == expected->string.length
1534               && !memcmp (actual->string.string, expected->string.string,
1535                           actual->string.length));
1536
1537     default:
1538       return true;
1539     }
1540 }
1541
1542 static size_t
1543 lex_at_phrase__ (struct lexer *lexer, const char *s)
1544 {
1545   struct string_lexer slex;
1546   struct token token;
1547
1548   size_t i = 0;
1549   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1550   while (string_lexer_next (&slex, &token))
1551     {
1552       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1553       token_uninit (&token);
1554       if (!match)
1555         return 0;
1556     }
1557   return i;
1558 }
1559
/* Returns true if LEXER is positioned at the sequence of tokens that S
   parses into, false otherwise.  Nothing is consumed either way.

   S may be any sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or "END
   INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n = lex_at_phrase__ (lexer, s);
  return n != 0;
}
1571
/* If LEXER is positioned at the sequence of tokens that S parses into, skips
   past those tokens and returns true.  Otherwise, consumes nothing and
   returns false.

   S may be any sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or "END
   INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n = lex_at_phrase__ (lexer, s);
  if (n == 0)
    return false;

  lex_get_n (lexer, n);
  return true;
}
1586
1587 /* Returns the 1-based line number of the source text at the byte OFFSET in
1588    SRC. */
1589 static int
1590 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1591 {
1592   size_t lo = 0;
1593   size_t hi = src->n_lines;
1594   for (;;)
1595     {
1596       size_t mid = (lo + hi) / 2;
1597       if (mid + 1 >= src->n_lines)
1598         return src->n_lines;
1599       else if (offset >= src->lines[mid + 1])
1600         lo = mid;
1601       else if (offset < src->lines[mid])
1602         hi = mid;
1603       else
1604         return mid + 1;
1605     }
1606 }
1607
1608 /* Returns the 1-based column number of the source text at the byte OFFSET in
1609    SRC. */
1610 static int
1611 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1612 {
1613   const char *newline = memrchr (src->buffer, '\n', offset);
1614   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1615   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1616 }
1617
1618 static struct msg_point
1619 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1620 {
1621   return (struct msg_point) {
1622     .line = lex_source_ofs_to_line_number (src, offset),
1623     .column = lex_source_ofs_to_column_number (src, offset),
1624   };
1625 }
1626
1627 static struct msg_point
1628 lex_token_start_point (const struct lex_source *src,
1629                        const struct lex_token *token)
1630 {
1631   return lex_source_ofs_to_point__ (src, token->token_pos);
1632 }
1633
1634 static struct msg_point
1635 lex_token_end_point (const struct lex_source *src,
1636                      const struct lex_token *token)
1637 {
1638   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1639 }
1640
1641 static struct msg_location
1642 lex_token_location (const struct lex_source *src,
1643                     const struct lex_token *t0,
1644                     const struct lex_token *t1)
1645 {
1646   return (struct msg_location) {
1647     .file_name = intern_new_if_nonnull (src->reader->file_name),
1648     .start = lex_token_start_point (src, t0),
1649     .end = lex_token_end_point (src, t1),
1650   };
1651 }
1652
1653 static struct msg_location *
1654 lex_token_location_rw (const struct lex_source *src,
1655                        const struct lex_token *t0,
1656                        const struct lex_token *t1)
1657 {
1658   struct msg_location location = lex_token_location (src, t0, t1);
1659   return msg_location_dup (&location);
1660 }
1661
/* Returns a msg_location, obtained from lex_token_location_rw(), that spans
   the tokens with offsets OFS0 through OFS1 in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1669
1670 /* Returns the name of the syntax file from which the current command is drawn.
1671    Returns NULL for a T_STOP token or if the command's source does not have
1672    line numbers.
1673
1674    There is no version of this function that takes an N argument because
1675    lookahead only works to the end of a command and any given command is always
1676    within a single syntax file. */
1677 const char *
1678 lex_get_file_name (const struct lexer *lexer)
1679 {
1680   struct lex_source *src = lex_source__ (lexer);
1681   return src == NULL ? NULL : src->reader->file_name;
1682 }
1683
1684 /* Returns a newly allocated msg_location for the syntax that represents tokens
1685    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1686    must eventually free the location (with msg_location_destroy()). */
1687 struct msg_location *
1688 lex_get_location (const struct lexer *lexer, int n0, int n1)
1689 {
1690   struct msg_location *loc = xmalloc (sizeof *loc);
1691   *loc = (struct msg_location) {
1692     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1693     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1694     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1695     .src = lex_source__ (lexer),
1696   };
1697   lex_source_ref (loc->src);
1698   return loc;
1699 }
1700
1701 const char *
1702 lex_get_encoding (const struct lexer *lexer)
1703 {
1704   struct lex_source *src = lex_source__ (lexer);
1705   return src == NULL ? NULL : src->reader->encoding;
1706 }
1707
1708 /* Returns the syntax mode for the syntax file from which the current drawn is
1709    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1710    does not have line numbers.
1711
1712    There is no version of this function that takes an N argument because
1713    lookahead only works to the end of a command and any given command is always
1714    within a single syntax file. */
1715 enum segmenter_mode
1716 lex_get_syntax_mode (const struct lexer *lexer)
1717 {
1718   struct lex_source *src = lex_source__ (lexer);
1719   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1720 }
1721
1722 /* Returns the error mode for the syntax file from which the current drawn is
1723    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1724    source does not have line numbers.
1725
1726    There is no version of this function that takes an N argument because
1727    lookahead only works to the end of a command and any given command is always
1728    within a single syntax file. */
1729 enum lex_error_mode
1730 lex_get_error_mode (const struct lexer *lexer)
1731 {
1732   struct lex_source *src = lex_source__ (lexer);
1733   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1734 }
1735
1736 /* If the source that LEXER is currently reading has error mode
1737    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1738    token to be read comes directly from whatever is next read from the stream.
1739
1740    It makes sense to call this function after encountering an error in a
1741    command entered on the console, because usually the user would prefer not to
1742    have cascading errors. */
1743 void
1744 lex_interactive_reset (struct lexer *lexer)
1745 {
1746   struct lex_source *src = lex_source__ (lexer);
1747   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1748     {
1749       src->length = 0;
1750       src->journal_pos = src->seg_pos = 0;
1751       src->n_lines = 0;
1752       src->suppress_next_newline = false;
1753       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1754                                        false);
1755       lex_stage_clear (&src->pp);
1756       lex_stage_clear (&src->merge);
1757       lex_source_clear_parse (src);
1758       lex_source_push_endcmd__ (src);
1759     }
1760 }
1761
1762 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1763 void
1764 lex_discard_rest_of_command (struct lexer *lexer)
1765 {
1766   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1767     lex_get (lexer);
1768 }
1769
1770 /* Discards all lookahead tokens in LEXER, then discards all input sources
1771    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1772    runs out of input sources. */
1773 void
1774 lex_discard_noninteractive (struct lexer *lexer)
1775 {
1776   struct lex_source *src = lex_source__ (lexer);
1777
1778   if (src != NULL)
1779     {
1780       lex_stage_clear (&src->pp);
1781       lex_stage_clear (&src->merge);
1782       lex_source_clear_parse (src);
1783
1784       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1785            src = lex_source__ (lexer))
1786         {
1787           ll_remove (&src->ll);
1788           lex_source_unref (src);
1789         }
1790     }
1791 }
1792 \f
1793 static void
1794 lex_source_expand__ (struct lex_source *src)
1795 {
1796   if (src->length >= src->allocated)
1797     src->buffer = x2realloc (src->buffer, &src->allocated);
1798 }
1799
1800 static void
1801 lex_source_read__ (struct lex_source *src)
1802 {
1803   do
1804     {
1805       lex_source_expand__ (src);
1806
1807       size_t space = src->allocated - src->length;
1808       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1809       size_t n = src->reader->class->read (src->reader,
1810                                            &src->buffer[src->length],
1811                                            space, prompt);
1812       assert (n <= space);
1813
1814       if (n == 0)
1815         {
1816           /* End of input. */
1817           src->reader->eof = true;
1818           return;
1819         }
1820
1821       src->length += n;
1822     }
1823   while (!memchr (&src->buffer[src->seg_pos], '\n',
1824                   src->length - src->seg_pos));
1825 }
1826
1827 static struct lex_source *
1828 lex_source__ (const struct lexer *lexer)
1829 {
1830   return (ll_is_empty (&lexer->sources) ? NULL
1831           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1832 }
1833
1834 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1835    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
1836    both zero, this requests the syntax for the first token in the current
1837    command.)  The caller must eventually free the returned string (with
1838    free()).  The syntax is encoded in UTF-8 and in the original form supplied
1839    to the lexer so that, for example, it may include comments, spaces, and
1840    new-lines if it spans multiple tokens.  Macro expansion, however, has
1841    already been performed. */
1842 static char *
1843 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1844 {
1845   struct string s = DS_EMPTY_INITIALIZER;
1846   for (size_t i = ofs0; i <= ofs1; )
1847     {
1848       /* Find [I,J) as the longest sequence of tokens not produced by macro
1849          expansion, or otherwise the longest sequence expanded from a single
1850          macro call. */
1851       const struct lex_token *first = lex_source_ofs__ (src, i);
1852       size_t j;
1853       for (j = i + 1; j <= ofs1; j++)
1854         {
1855           const struct lex_token *cur = lex_source_ofs__ (src, j);
1856           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1857               || first->macro_rep != cur->macro_rep)
1858             break;
1859         }
1860       const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1861
1862       /* Now add the syntax for this sequence of tokens to SRC. */
1863       if (!ds_is_empty (&s))
1864         ds_put_byte (&s, ' ');
1865       if (!first->macro_rep)
1866         {
1867           size_t start = first->token_pos;
1868           size_t end = last->token_pos + last->token_len;
1869           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1870         }
1871       else
1872         {
1873           size_t start = first->ofs;
1874           size_t end = last->ofs + last->len;
1875           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1876                                            end - start));
1877         }
1878
1879       i = j;
1880     }
1881   return ds_steal_cstr (&s);
1882 }
1883
1884 static bool
1885 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1886 {
1887   for (int i = ofs0; i <= ofs1; i++)
1888     if (lex_source_ofs__ (src, i)->macro_rep)
1889       return true;
1890   return false;
1891 }
1892
1893 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1894    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1895    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1896    the original form supplied to the lexer so that, for example, it may include
1897    comments, spaces, and new-lines if it spans multiple tokens.
1898
1899    Returns an empty string if the token range doesn't include a macro call.
1900
1901    The caller must not modify or free the returned string. */
1902 static struct substring
1903 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
1904 {
1905   if (!lex_source_contains_macro_call (src, ofs0, ofs1))
1906     return ss_empty ();
1907
1908   const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
1909   const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
1910   size_t start = token0->token_pos;
1911   size_t end = token1->token_pos + token1->token_len;
1912
1913   return ss_buffer (&src->buffer[start], end - start);
1914 }
1915
1916 static void
1917 lex_source_error_valist (struct lex_source *src, int ofs0, int ofs1,
1918                          const char *format, va_list args)
1919 {
1920   const struct lex_token *token;
1921   struct string s;
1922
1923   ds_init_empty (&s);
1924
1925   token = lex_source_ofs__ (src, ofs0);
1926   if (token->token.type == T_ENDCMD)
1927     ds_put_cstr (&s, _("Syntax error at end of command"));
1928   else
1929     {
1930       /* Get the syntax that caused the error. */
1931       char *raw_syntax = lex_source_syntax__ (src, ofs0, ofs1);
1932       char syntax[64];
1933       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1934       free (raw_syntax);
1935
1936       /* Get the macro call(s) that expanded to the syntax that caused the
1937          error. */
1938       char call[64];
1939       str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
1940                      call, sizeof call);
1941
1942       if (syntax[0])
1943         {
1944           if (call[0])
1945             ds_put_format (&s,
1946                            _("Syntax error at `%s' (in expansion of `%s')"),
1947                            syntax, call);
1948           else
1949             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1950         }
1951       else
1952         {
1953           if (call[0])
1954             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1955                            call);
1956           else
1957             ds_put_cstr (&s, _("Syntax error"));
1958         }
1959     }
1960
1961   if (format)
1962     {
1963       ds_put_cstr (&s, ": ");
1964       ds_put_vformat (&s, format, args);
1965     }
1966   if (ds_last (&s) != '.')
1967     ds_put_byte (&s, '.');
1968
1969   struct msg *m = xmalloc (sizeof *m);
1970   *m = (struct msg) {
1971     .category = MSG_C_SYNTAX,
1972     .severity = MSG_S_ERROR,
1973     .location = lex_source_get_location (src, ofs0, ofs1),
1974     .text = ds_steal_cstr (&s),
1975   };
1976   msg_emit (m);
1977 }
1978
1979 static void
1980 lex_get_error (struct lex_source *src, const struct lex_token *token)
1981 {
1982   char syntax[64];
1983   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1984                  syntax, sizeof syntax);
1985
1986   struct string s = DS_EMPTY_INITIALIZER;
1987   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1988   ds_put_format (&s, ": %s", token->token.string.string);
1989
1990   struct msg *m = xmalloc (sizeof *m);
1991   *m = (struct msg) {
1992     .category = MSG_C_SYNTAX,
1993     .severity = MSG_S_ERROR,
1994     .location = lex_token_location_rw (src, token, token),
1995     .text = ds_steal_cstr (&s),
1996   };
1997   msg_emit (m);
1998 }
1999
2000 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2001    underlying lex_reader if necessary.  Returns true if a new token was added
2002    to SRC's deque, false otherwise.  The caller should retry failures unless
2003    SRC's 'eof' marker was set to true indicating that there will be no more
2004    tokens from this source. */
2005 static bool
2006 lex_source_try_get_pp (struct lex_source *src)
2007 {
2008   /* Append a new token to SRC and initialize it. */
2009   struct lex_token *token = xmalloc (sizeof *token);
2010   token->token = (struct token) { .type = T_STOP };
2011   token->macro_rep = NULL;
2012   token->ref_cnt = NULL;
2013   token->token_pos = src->seg_pos;
2014
2015   /* Extract a segment. */
2016   const char *segment;
2017   enum segment_type seg_type;
2018   int seg_len;
2019   for (;;)
2020     {
2021       segment = &src->buffer[src->seg_pos];
2022       seg_len = segmenter_push (&src->segmenter, segment,
2023                                 src->length - src->seg_pos,
2024                                 src->reader->eof, &seg_type);
2025       if (seg_len >= 0)
2026         break;
2027
2028       /* The segmenter needs more input to produce a segment. */
2029       assert (!src->reader->eof);
2030       lex_source_read__ (src);
2031     }
2032
2033   /* Update state based on the segment. */
2034   token->token_len = seg_len;
2035   src->seg_pos += seg_len;
2036   if (seg_type == SEG_NEWLINE)
2037     {
2038       if (src->n_lines >= src->allocated_lines)
2039         src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2040                                  sizeof *src->lines);
2041       src->lines[src->n_lines++] = src->seg_pos;
2042     }
2043
2044   /* Get a token from the segment. */
2045   enum tokenize_result result = token_from_segment (
2046     seg_type, ss_buffer (segment, seg_len), &token->token);
2047
2048   /* If we've reached the end of a line, or the end of a command, then pass
2049      the line to the output engine as a syntax text item.  */
2050   int n_lines = seg_type == SEG_NEWLINE;
2051   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2052     {
2053       n_lines++;
2054       src->suppress_next_newline = true;
2055     }
2056   else if (n_lines > 0 && src->suppress_next_newline)
2057     {
2058       n_lines--;
2059       src->suppress_next_newline = false;
2060     }
2061   for (int i = 0; i < n_lines; i++)
2062     {
2063       /* Beginning of line. */
2064       const char *line = &src->buffer[src->journal_pos];
2065
2066       /* Calculate line length, including \n or \r\n end-of-line if present.
2067
2068          We use src->length even though that may be beyond what we've actually
2069          converted to tokens.  That's because, if we're emitting the line due
2070          to SEG_END_COMMAND, we want to take the whole line through the
2071          newline, not just through the '.'. */
2072       size_t max_len = src->length - src->journal_pos;
2073       const char *newline = memchr (line, '\n', max_len);
2074       size_t line_len = newline ? newline - line + 1 : max_len;
2075
2076       /* Calculate line length excluding end-of-line. */
2077       size_t copy_len = line_len;
2078       if (copy_len > 0 && line[copy_len - 1] == '\n')
2079         copy_len--;
2080       if (copy_len > 0 && line[copy_len - 1] == '\r')
2081         copy_len--;
2082
2083       /* Submit the line as syntax. */
2084       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2085                                                    xmemdup0 (line, copy_len),
2086                                                    NULL));
2087
2088       src->journal_pos += line_len;
2089     }
2090
2091   switch (result)
2092     {
2093     case TOKENIZE_ERROR:
2094       lex_get_error (src, token);
2095       /* Fall through. */
2096     case TOKENIZE_EMPTY:
2097       lex_token_destroy (token);
2098       return false;
2099
2100     case TOKENIZE_TOKEN:
2101       if (token->token.type == T_STOP)
2102         {
2103           token->token.type = T_ENDCMD;
2104           src->eof = true;
2105         }
2106       lex_stage_push_last (&src->pp, token);
2107       return true;
2108     }
2109   NOT_REACHED ();
2110 }
2111
2112 /* Attempts to append a new token to SRC.  Returns true if successful, false on
2113    failure.  On failure, the end of SRC has been reached and no more tokens
2114    will be forthcoming from it.
2115
2116    Does not make the new token available for lookahead yet; the caller must
2117    adjust SRC's 'middle' pointer to do so. */
2118 static bool
2119 lex_source_get_pp (struct lex_source *src)
2120 {
2121   while (!src->eof)
2122     if (lex_source_try_get_pp (src))
2123       return true;
2124   return false;
2125 }
2126
2127 static bool
2128 lex_source_try_get_merge (const struct lex_source *src_)
2129 {
2130   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2131
2132   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2133     return false;
2134
2135   if (!settings_get_mexpand ())
2136     {
2137       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2138       return true;
2139     }
2140
2141   /* Now pass tokens one-by-one to the macro expander.
2142
2143      In the common case where there is no macro to expand, the loop is not
2144      entered.  */
2145   struct macro_call *mc;
2146   int n_call = macro_call_create (src->lexer->macros,
2147                                   &lex_stage_first (&src->pp)->token, &mc);
2148   for (int ofs = 1; !n_call; ofs++)
2149     {
2150       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2151         {
2152           /* This should not be reachable because we always get a T_ENDCMD at
2153              the end of an input file (transformed from T_STOP by
2154              lex_source_try_get_pp()) and the macro_expander should always
2155              terminate expansion on T_ENDCMD. */
2156           NOT_REACHED ();
2157         }
2158
2159       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2160       const struct macro_token mt = {
2161         .token = t->token,
2162         .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2163       };
2164       const struct msg_location loc = lex_token_location (src, t, t);
2165       n_call = macro_call_add (mc, &mt, &loc);
2166     }
2167   if (n_call < 0)
2168     {
2169       /* False alarm: no macro expansion after all.  Use first token as
2170          lookahead.  We'll retry macro expansion from the second token next
2171          time around. */
2172       macro_call_destroy (mc);
2173       lex_stage_shift (&src->merge, &src->pp, 1);
2174       return true;
2175     }
2176
2177   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2178      are a macro call.  (These are likely to be the only tokens in 'pp'.)
2179      Expand them.  */
2180   const struct lex_token *c0 = lex_stage_first (&src->pp);
2181   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2182   struct macro_tokens expansion = { .n = 0 };
2183   struct msg_location loc = lex_token_location (src, c0, c1);
2184   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2185   macro_call_destroy (mc);
2186
2187   /* Convert the macro expansion into syntax for possible error messages
2188      later. */
2189   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2190   size_t *len = xnmalloc (expansion.n, sizeof *len);
2191   struct string s = DS_EMPTY_INITIALIZER;
2192   macro_tokens_to_syntax (&expansion, &s, ofs, len);
2193
2194   if (settings_get_mprint ())
2195     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2196                                           _("Macro Expansion")));
2197
2198   /* Append the macro expansion tokens to the lookahead. */
2199   if (expansion.n > 0)
2200     {
2201       char *macro_rep = ds_steal_cstr (&s);
2202       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2203       *ref_cnt = expansion.n;
2204       for (size_t i = 0; i < expansion.n; i++)
2205         {
2206           struct lex_token *token = xmalloc (sizeof *token);
2207           *token = (struct lex_token) {
2208             .token = expansion.mts[i].token,
2209             .token_pos = c0->token_pos,
2210             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2211             .macro_rep = macro_rep,
2212             .ofs = ofs[i],
2213             .len = len[i],
2214             .ref_cnt = ref_cnt,
2215           };
2216           lex_stage_push_last (&src->merge, token);
2217
2218           ss_dealloc (&expansion.mts[i].syntax);
2219         }
2220     }
2221   else
2222     ds_destroy (&s);
2223   free (expansion.mts);
2224   free (ofs);
2225   free (len);
2226
2227   /* Destroy the tokens for the call. */
2228   for (size_t i = 0; i < n_call; i++)
2229     lex_stage_pop_first (&src->pp);
2230
2231   return expansion.n > 0;
2232 }
2233
2234 /* Attempts to obtain at least one new token into 'merge' in SRC.
2235
2236    Returns true if successful, false on failure.  In the latter case, SRC is
2237    exhausted and 'src->eof' is now true. */
2238 static bool
2239 lex_source_get_merge (struct lex_source *src)
2240 {
2241   while (!src->eof)
2242     if (lex_source_try_get_merge (src))
2243       return true;
2244   return false;
2245 }
2246
2247 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2248
2249    Returns true if successful, false on failure.  In the latter case, SRC is
2250    exhausted and 'src->eof' is now true. */
2251 static bool
2252 lex_source_get_parse (struct lex_source *src)
2253 {
2254   struct merger m = MERGER_INIT;
2255   struct token out;
2256   for (size_t i = 0; ; i++)
2257     {
2258       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2259         {
2260           /* We always get a T_ENDCMD at the end of an input file
2261              (transformed from T_STOP by lex_source_try_get_pp()) and
2262              merger_add() should never return -1 on T_ENDCMD. */
2263           assert (lex_stage_is_empty (&src->merge));
2264           return false;
2265         }
2266
2267       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2268                                &out);
2269       if (!retval)
2270         {
2271           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2272           return true;
2273         }
2274       else if (retval > 0)
2275         {
2276           /* Add a token that merges all the tokens together. */
2277           const struct lex_token *first = lex_stage_first (&src->merge);
2278           const struct lex_token *last = lex_stage_nth (&src->merge,
2279                                                         retval - 1);
2280           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2281           struct lex_token *t = xmalloc (sizeof *t);
2282           *t = (struct lex_token) {
2283             .token = out,
2284             .token_pos = first->token_pos,
2285             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2286
2287             /* This works well if all the tokens were not expanded from macros,
2288                or if they came from the same macro expansion.  It just gives up
2289                in the other (corner) cases. */
2290             .macro_rep = macro ? first->macro_rep : NULL,
2291             .ofs = macro ? first->ofs : 0,
2292             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2293             .ref_cnt = macro ? first->ref_cnt : NULL,
2294           };
2295           if (t->ref_cnt)
2296             ++*t->ref_cnt;
2297           lex_source_push_parse (src, t);
2298
2299           for (int i = 0; i < retval; i++)
2300             lex_stage_pop_first (&src->merge);
2301           return true;
2302         }
2303     }
2304 }
2305 \f
2306 static void
2307 lex_source_push_endcmd__ (struct lex_source *src)
2308 {
2309   assert (src->n_parse == 0);
2310
2311   struct lex_token *token = xmalloc (sizeof *token);
2312   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2313   lex_source_push_parse (src, token);
2314 }
2315
2316 static void
2317 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2318 {
2319   if (src->n_parse >= src->allocated_parse)
2320     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2321                              sizeof *src->parse);
2322   src->parse[src->n_parse++] = token;
2323 }
2324
2325 static void
2326 lex_source_clear_parse (struct lex_source *src)
2327 {
2328   for (size_t i = 0; i < src->n_parse; i++)
2329     lex_token_destroy (src->parse[i]);
2330   src->n_parse = src->parse_ofs = 0;
2331 }
2332
2333 static struct lex_source *
2334 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2335 {
2336   size_t allocated_lines = 4;
2337   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2338   *lines = 0;
2339
2340   struct lex_source *src = xmalloc (sizeof *src);
2341   *src = (struct lex_source) {
2342     .n_refs = 1,
2343     .reader = reader,
2344     .segmenter = segmenter_init (reader->syntax, false),
2345     .lexer = lexer,
2346     .lines = lines,
2347     .n_lines = 1,
2348     .allocated_lines = allocated_lines,
2349   };
2350
2351   lex_source_push_endcmd__ (src);
2352
2353   return src;
2354 }
2355
2356 void
2357 lex_set_message_handler (struct lexer *lexer,
2358                          void (*output_msg) (const struct msg *,
2359                                              struct lexer *))
2360 {
2361   struct msg_handler msg_handler = {
2362     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2363     .aux = lexer,
2364     .lex_source_ref = lex_source_ref,
2365     .lex_source_unref = lex_source_unref,
2366     .lex_source_get_line = lex_source_get_line,
2367   };
2368   msg_set_handler (&msg_handler);
2369 }
2370
2371 void
2372 lex_source_ref (const struct lex_source *src_)
2373 {
2374   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2375   if (src)
2376     {
2377       assert (src->n_refs > 0);
2378       src->n_refs++;
2379     }
2380 }
2381
2382 void
2383 lex_source_unref (struct lex_source *src)
2384 {
2385   if (!src)
2386     return;
2387
2388   assert (src->n_refs > 0);
2389   if (--src->n_refs > 0)
2390     return;
2391
2392   char *file_name = src->reader->file_name;
2393   char *encoding = src->reader->encoding;
2394   if (src->reader->class->destroy != NULL)
2395     src->reader->class->destroy (src->reader);
2396   free (file_name);
2397   free (encoding);
2398   free (src->buffer);
2399   free (src->lines);
2400   lex_stage_uninit (&src->pp);
2401   lex_stage_uninit (&src->merge);
2402   lex_source_clear_parse (src);
2403   free (src->parse);
2404   free (src);
2405 }
2406 \f
2407 struct lex_file_reader
2408   {
2409     struct lex_reader reader;
2410     struct u8_istream *istream;
2411   };
2412
2413 static struct lex_reader_class lex_file_reader_class;
2414
2415 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2416    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2417    ENCODING, which should take one of the forms accepted by
2418    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2419    mode of the new reader, respectively.
2420
2421    Returns a null pointer if FILE_NAME cannot be opened. */
2422 struct lex_reader *
2423 lex_reader_for_file (const char *file_name, const char *encoding,
2424                      enum segmenter_mode syntax,
2425                      enum lex_error_mode error)
2426 {
2427   struct lex_file_reader *r;
2428   struct u8_istream *istream;
2429
2430   istream = (!strcmp(file_name, "-")
2431              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2432              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2433   if (istream == NULL)
2434     {
2435       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2436       return NULL;
2437     }
2438
2439   r = xmalloc (sizeof *r);
2440   lex_reader_init (&r->reader, &lex_file_reader_class);
2441   r->reader.syntax = syntax;
2442   r->reader.error = error;
2443   r->reader.file_name = xstrdup (file_name);
2444   r->reader.encoding = xstrdup_if_nonnull (encoding);
2445   r->reader.line_number = 1;
2446   r->istream = istream;
2447
2448   return &r->reader;
2449 }
2450
2451 static struct lex_file_reader *
2452 lex_file_reader_cast (struct lex_reader *r)
2453 {
2454   return UP_CAST (r, struct lex_file_reader, reader);
2455 }
2456
2457 static size_t
2458 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2459                enum prompt_style prompt_style UNUSED)
2460 {
2461   struct lex_file_reader *r = lex_file_reader_cast (r_);
2462   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2463   if (n_read < 0)
2464     {
2465       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2466       return 0;
2467     }
2468   return n_read;
2469 }
2470
2471 static void
2472 lex_file_close (struct lex_reader *r_)
2473 {
2474   struct lex_file_reader *r = lex_file_reader_cast (r_);
2475
2476   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2477     {
2478       if (u8_istream_close (r->istream) != 0)
2479         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2480     }
2481   else
2482     u8_istream_free (r->istream);
2483
2484   free (r);
2485 }
2486
2487 static struct lex_reader_class lex_file_reader_class =
2488   {
2489     lex_file_read,
2490     lex_file_close
2491   };
2492 \f
2493 struct lex_string_reader
2494   {
2495     struct lex_reader reader;
2496     struct substring s;
2497     size_t offset;
2498   };
2499
2500 static struct lex_reader_class lex_string_reader_class;
2501
2502 /* Creates and returns a new lex_reader for the contents of S, which must be
2503    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2504    with ss_dealloc() when it is closed. */
2505 struct lex_reader *
2506 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2507 {
2508   struct lex_string_reader *r;
2509
2510   r = xmalloc (sizeof *r);
2511   lex_reader_init (&r->reader, &lex_string_reader_class);
2512   r->reader.syntax = SEG_MODE_AUTO;
2513   r->reader.encoding = xstrdup_if_nonnull (encoding);
2514   r->s = s;
2515   r->offset = 0;
2516
2517   return &r->reader;
2518 }
2519
2520 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2521    which must be encoded in ENCODING.  The caller retains ownership of S. */
2522 struct lex_reader *
2523 lex_reader_for_string (const char *s, const char *encoding)
2524 {
2525   struct substring ss;
2526   ss_alloc_substring (&ss, ss_cstr (s));
2527   return lex_reader_for_substring_nocopy (ss, encoding);
2528 }
2529
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result, which is taken to be encoded in ENCODING.  The reader owns the
   formatted string. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2544
2545 static struct lex_string_reader *
2546 lex_string_reader_cast (struct lex_reader *r)
2547 {
2548   return UP_CAST (r, struct lex_string_reader, reader);
2549 }
2550
2551 static size_t
2552 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2553                  enum prompt_style prompt_style UNUSED)
2554 {
2555   struct lex_string_reader *r = lex_string_reader_cast (r_);
2556   size_t chunk;
2557
2558   chunk = MIN (n, r->s.length - r->offset);
2559   memcpy (buf, r->s.string + r->offset, chunk);
2560   r->offset += chunk;
2561
2562   return chunk;
2563 }
2564
2565 static void
2566 lex_string_close (struct lex_reader *r_)
2567 {
2568   struct lex_string_reader *r = lex_string_reader_cast (r_);
2569
2570   ss_dealloc (&r->s);
2571   free (r);
2572 }
2573
2574 static struct lex_reader_class lex_string_reader_class =
2575   {
2576     lex_string_read,
2577     lex_string_close
2578   };
2579 \f
2580 struct substring
2581 lex_source_get_line (const struct lex_source *src, int line)
2582 {
2583   if (line < 1 || line > src->n_lines)
2584     return ss_empty ();
2585
2586   size_t ofs = src->lines[line - 1];
2587   size_t end = line >= src->n_lines ? src->length : src->lines[line];
2588   return ss_buffer (&src->buffer[ofs], end - ofs);
2589 }