pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Pairs the parsed token itself with bookkeeping that records where the
   token's text lies within the lex_source's buffer. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */
    /* Reported by lex_next_is_from_macro(); lex_push_token__() initializes
       it to false.  NOTE(review): presumably set when the token was produced
       by macro expansion -- confirm against the code that sets it. */
    bool from_macro;
  };
  73
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* The wrapped reader (see above). */
    struct lexer *lexer;        /* NOTE(review): presumably the owning lexer. */
    struct segmenter segmenter; /* NOTE(review): presumably the state used to
                                   split the buffer into segments -- confirm. */
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline; /* NOTE(review): purpose is not evident from
                                   this part of the file. */

    /* Tokens.

       lex_push_token__() adds at the deque's front and lex_source_pop__()
       removes from its back, so the back is the current token and the front
       is the furthest lookahead. */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 104
/* Forward declarations of the file-local constructor and destructor for
   "struct lex_source", which are used before their definitions. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 108
/* Lexer.

   Holds an ordered list of token sources plus the set of macros added with
   lex_define_macro(). */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Created in lex_create(), destroyed in
                                   lex_destroy(). */
  };
 115
/* Forward declarations of file-local helpers that are used before they are
   defined. */
static struct lex_source *lex_source__ (const struct lexer *);
static struct substring lex_source_get_syntax__ (const struct lex_source *,
                                                 int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static void lex_source_pop__ (struct lex_source *);
static bool lex_source_get (const struct lex_source *);
/* FORMAT is the 4th parameter; the arguments arrive as a va_list, hence the 0
   in PRINTF_FORMAT. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 129 \f
 130 /* Initializes READER with the specified CLASS and otherwise some reasonable
 131    defaults.  The caller should fill in the others members as desired. */
 132 void
 133 lex_reader_init (struct lex_reader *reader,
 134                  const struct lex_reader_class *class)
 135 {
 136   reader->class = class;
 137   reader->syntax = SEG_MODE_AUTO;
 138   reader->error = LEX_ERROR_CONTINUE;
 139   reader->file_name = NULL;
 140   reader->encoding = NULL;
 141   reader->line_number = 0;
 142   reader->eof = false;
 143 }
 144
 145 /* Frees any file name already in READER and replaces it by a copy of
 146    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 147 void
 148 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 149 {
 150   free (reader->file_name);
 151   reader->file_name = xstrdup_if_nonnull (file_name);
 152 }
 153 \f
 154 /* Creates and returns a new lexer. */
 155 struct lexer *
 156 lex_create (void)
 157 {
 158   struct lexer *lexer = xmalloc (sizeof *lexer);
 159   *lexer = (struct lexer) {
 160     .sources = LL_INITIALIZER (lexer->sources),
 161     .macros = macro_set_create (),
 162   };
 163   return lexer;
 164 }
 165
 166 /* Destroys LEXER. */
 167 void
 168 lex_destroy (struct lexer *lexer)
 169 {
 170   if (lexer != NULL)
 171     {
 172       struct lex_source *source, *next;
 173
 174       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 175         lex_source_destroy (source);
 176       macro_set_destroy (lexer->macros);
 177       free (lexer);
 178     }
 179 }
 180
 181 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 182    same name.  Takes ownership of M. */
 183 void
 184 lex_define_macro (struct lexer *lexer, struct macro *m)
 185 {
 186   macro_set_add (lexer->macros, m);
 187 }
 188
 189 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 190    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 191    token. */
 192 void
 193 lex_include (struct lexer *lexer, struct lex_reader *reader)
 194 {
 195   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 196   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 197 }
 198
 199 /* Appends READER to LEXER, so that it will be read after all other current
 200    readers have already been read. */
 201 void
 202 lex_append (struct lexer *lexer, struct lex_reader *reader)
 203 {
 204   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 205 }
 206 \f
 207 /* Advancing. */
 208
 209 static struct lex_token *
 210 lex_push_token__ (struct lex_source *src)
 211 {
 212   struct lex_token *token;
 213
 214   if (deque_is_full (&src->deque))
 215     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 216
 217   token = &src->tokens[deque_push_front (&src->deque)];
 218   token->token = (struct token) { .type = T_STOP };
 219   token->from_macro = false;
 220   return token;
 221 }
 222
 223 static void
 224 lex_source_pop__ (struct lex_source *src)
 225 {
 226   token_uninit (&src->tokens[deque_pop_back (&src->deque)].token);
 227 }
 228
 229 static void
 230 lex_source_pop_front (struct lex_source *src)
 231 {
 232   token_uninit (&src->tokens[deque_pop_front (&src->deque)].token);
 233 }
 234
 235 /* Advances LEXER to the next token, consuming the current token. */
 236 void
 237 lex_get (struct lexer *lexer)
 238 {
 239   struct lex_source *src;
 240
 241   src = lex_source__ (lexer);
 242   if (src == NULL)
 243     return;
 244
 245   if (!deque_is_empty (&src->deque))
 246     lex_source_pop__ (src);
 247
 248   while (deque_is_empty (&src->deque))
 249     if (!lex_source_get (src))
 250       {
 251         lex_source_destroy (src);
 252         src = lex_source__ (lexer);
 253         if (src == NULL)
 254           return;
 255       }
 256 }
 257 \f
 258 /* Issuing errors. */
 259
 260 /* Prints a syntax error message containing the current token and
 261    given message MESSAGE (if non-null). */
 262 void
 263 lex_error (struct lexer *lexer, const char *format, ...)
 264 {
 265   va_list args;
 266
 267   va_start (args, format);
 268   lex_next_error_valist (lexer, 0, 0, format, args);
 269   va_end (args);
 270 }
 271
 272 /* Prints a syntax error message containing the current token and
 273    given message MESSAGE (if non-null). */
 274 void
 275 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 276 {
 277   lex_next_error_valist (lexer, 0, 0, format, args);
 278 }
 279
 280 /* Prints a syntax error message containing the current token and
 281    given message MESSAGE (if non-null). */
 282 void
 283 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 284 {
 285   va_list args;
 286
 287   va_start (args, format);
 288   lex_next_error_valist (lexer, n0, n1, format, args);
 289   va_end (args);
 290 }
 291
 292 /* Prints a syntax error message saying that one of the strings provided as
 293    varargs, up to the first NULL, is expected. */
 294 void
 295 (lex_error_expecting) (struct lexer *lexer, ...)
 296 {
 297   va_list args;
 298
 299   va_start (args, lexer);
 300   lex_error_expecting_valist (lexer, args);
 301   va_end (args);
 302 }
 303
 304 /* Prints a syntax error message saying that one of the options provided in
 305    ARGS, up to the first NULL, is expected. */
 306 void
 307 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 308 {
 309   enum { MAX_OPTIONS = 9 };
 310   const char *options[MAX_OPTIONS];
 311   int n = 0;
 312   while (n < MAX_OPTIONS)
 313     {
 314       const char *option = va_arg (args, const char *);
 315       if (!option)
 316         break;
 317
 318       options[n++] = option;
 319     }
 320   lex_error_expecting_array (lexer, options, n);
 321 }
 322
 323 void
 324 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 325 {
 326   switch (n)
 327     {
 328     case 0:
 329       lex_error (lexer, NULL);
 330       break;
 331
 332     case 1:
 333       lex_error (lexer, _("expecting %s"), options[0]);
 334       break;
 335
 336     case 2:
 337       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 338       break;
 339
 340     case 3:
 341       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 342                  options[2]);
 343       break;
 344
 345     case 4:
 346       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 347                  options[0], options[1], options[2], options[3]);
 348       break;
 349
 350     case 5:
 351       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 352                  options[0], options[1], options[2], options[3], options[4]);
 353       break;
 354
 355     case 6:
 356       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 357                  options[0], options[1], options[2], options[3], options[4],
 358                  options[5]);
 359       break;
 360
 361     case 7:
 362       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 363                  options[0], options[1], options[2], options[3], options[4],
 364                  options[5], options[6]);
 365       break;
 366
 367     case 8:
 368       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 369                  options[0], options[1], options[2], options[3], options[4],
 370                  options[5], options[6], options[7]);
 371       break;
 372
 373     default:
 374       lex_error (lexer, NULL);
 375     }
 376 }
 377
 378 /* Reports an error to the effect that subcommand SBC may only be specified
 379    once.
 380
 381    This function does not take a lexer as an argument or use lex_error(),
 382    because the result would ordinarily just be redundant: "Syntax error at
 383    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 384    not help the user find the error. */
 385 void
 386 lex_sbc_only_once (const char *sbc)
 387 {
 388   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 389 }
 390
 391 /* Reports an error to the effect that subcommand SBC is missing.
 392
 393    This function does not take a lexer as an argument or use lex_error(),
 394    because a missing subcommand can normally be detected only after the whole
 395    command has been parsed, and so lex_error() would always report "Syntax
 396    error at end of command", which does not help the user find the error. */
 397 void
 398 lex_sbc_missing (const char *sbc)
 399 {
 400   msg (SE, _("Required subcommand %s was not specified."), sbc);
 401 }
 402
 403 /* Reports an error to the effect that specification SPEC may only be specified
 404    once within subcommand SBC. */
 405 void
 406 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 407 {
 408   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 409              spec, sbc);
 410 }
 411
 412 /* Reports an error to the effect that specification SPEC is missing within
 413    subcommand SBC. */
 414 void
 415 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 416 {
 417   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 418              sbc, spec);
 419 }
 420
 421 /* Prints a syntax error message containing the current token and
 422    given message MESSAGE (if non-null). */
 423 void
 424 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 425                        const char *format, va_list args)
 426 {
 427   struct lex_source *src = lex_source__ (lexer);
 428
 429   if (src != NULL)
 430     lex_source_error_valist (src, n0, n1, format, args);
 431   else
 432     {
 433       struct string s;
 434
 435       ds_init_empty (&s);
 436       ds_put_format (&s, _("Syntax error at end of input"));
 437       if (format != NULL)
 438         {
 439           ds_put_cstr (&s, ": ");
 440           ds_put_vformat (&s, format, args);
 441         }
 442       ds_put_byte (&s, '.');
 443       msg (SE, "%s", ds_cstr (&s));
 444       ds_destroy (&s);
 445     }
 446 }
 447
 448 /* Checks that we're at end of command.
 449    If so, returns a successful command completion code.
 450    If not, flags a syntax error and returns an error command
 451    completion code. */
 452 int
 453 lex_end_of_command (struct lexer *lexer)
 454 {
 455   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 456     {
 457       lex_error (lexer, _("expecting end of command"));
 458       return CMD_FAILURE;
 459     }
 460   else
 461     return CMD_SUCCESS;
 462 }
 463 \f
 464 /* Token testing functions. */
 465
 466 /* Returns true if the current token is a number. */
 467 bool
 468 lex_is_number (const struct lexer *lexer)
 469 {
 470   return lex_next_is_number (lexer, 0);
 471 }
 472
 473 /* Returns true if the current token is a string. */
 474 bool
 475 lex_is_string (const struct lexer *lexer)
 476 {
 477   return lex_next_is_string (lexer, 0);
 478 }
 479
 480 /* Returns the value of the current token, which must be a
 481    floating point number. */
 482 double
 483 lex_number (const struct lexer *lexer)
 484 {
 485   return lex_next_number (lexer, 0);
 486 }
 487
 488 /* Returns true iff the current token is an integer. */
 489 bool
 490 lex_is_integer (const struct lexer *lexer)
 491 {
 492   return lex_next_is_integer (lexer, 0);
 493 }
 494
 495 /* Returns the value of the current token, which must be an
 496    integer. */
 497 long
 498 lex_integer (const struct lexer *lexer)
 499 {
 500   return lex_next_integer (lexer, 0);
 501 }
 502 \f
 503 /* Token testing functions with lookahead.
 504
 505    A value of 0 for N as an argument to any of these functions refers to the
 506    current token.  Lookahead is limited to the current command.  Any N greater
 507    than the number of tokens remaining in the current command will be treated
 508    as referring to a T_ENDCMD token. */
 509
 510 /* Returns true if the token N ahead of the current token is a number. */
 511 bool
 512 lex_next_is_number (const struct lexer *lexer, int n)
 513 {
 514   return token_is_number (lex_next (lexer, n));
 515 }
 516
 517 /* Returns true if the token N ahead of the current token is a string. */
 518 bool
 519 lex_next_is_string (const struct lexer *lexer, int n)
 520 {
 521   return token_is_string (lex_next (lexer, n));
 522 }
 523
 524 /* Returns the value of the token N ahead of the current token, which must be a
 525    floating point number. */
 526 double
 527 lex_next_number (const struct lexer *lexer, int n)
 528 {
 529   return token_number (lex_next (lexer, n));
 530 }
 531
 532 /* Returns true if the token N ahead of the current token is an integer. */
 533 bool
 534 lex_next_is_integer (const struct lexer *lexer, int n)
 535 {
 536   return token_is_integer (lex_next (lexer, n));
 537 }
 538
 539 /* Returns the value of the token N ahead of the current token, which must be
 540    an integer. */
 541 long
 542 lex_next_integer (const struct lexer *lexer, int n)
 543 {
 544   return token_integer (lex_next (lexer, n));
 545 }
 546 \f
 547 /* Token matching functions. */
 548
 549 /* If the current token has the specified TYPE, skips it and returns true.
 550    Otherwise, returns false. */
 551 bool
 552 lex_match (struct lexer *lexer, enum token_type type)
 553 {
 554   if (lex_token (lexer) == type)
 555     {
 556       lex_get (lexer);
 557       return true;
 558     }
 559   else
 560     return false;
 561 }
 562
 563 /* If the current token matches IDENTIFIER, skips it and returns true.
 564    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 565    returns false.
 566
 567    IDENTIFIER must be an ASCII string. */
 568 bool
 569 lex_match_id (struct lexer *lexer, const char *identifier)
 570 {
 571   return lex_match_id_n (lexer, identifier, 3);
 572 }
 573
 574 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 575    may be abbreviated to its first N letters.  Otherwise, returns false.
 576
 577    IDENTIFIER must be an ASCII string. */
 578 bool
 579 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 580 {
 581   if (lex_token (lexer) == T_ID
 582       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 583     {
 584       lex_get (lexer);
 585       return true;
 586     }
 587   else
 588     return false;
 589 }
 590
 591 /* If the current token is integer X, skips it and returns true.  Otherwise,
 592    returns false. */
 593 bool
 594 lex_match_int (struct lexer *lexer, int x)
 595 {
 596   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 597     {
 598       lex_get (lexer);
 599       return true;
 600     }
 601   else
 602     return false;
 603 }
 604 \f
 605 /* Forced matches. */
 606
 607 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 608    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 609    false.
 610
 611    IDENTIFIER must be an ASCII string. */
 612 bool
 613 lex_force_match_id (struct lexer *lexer, const char *identifier)
 614 {
 615   if (lex_match_id (lexer, identifier))
 616     return true;
 617   else
 618     {
 619       lex_error_expecting (lexer, identifier);
 620       return false;
 621     }
 622 }
 623
 624 /* If the current token has the specified TYPE, skips it and returns true.
 625    Otherwise, reports an error and returns false. */
 626 bool
 627 lex_force_match (struct lexer *lexer, enum token_type type)
 628 {
 629   if (lex_token (lexer) == type)
 630     {
 631       lex_get (lexer);
 632       return true;
 633     }
 634   else
 635     {
 636       const char *type_string = token_type_to_string (type);
 637       if (type_string)
 638         {
 639           char *s = xasprintf ("`%s'", type_string);
 640           lex_error_expecting (lexer, s);
 641           free (s);
 642         }
 643       else
 644         lex_error_expecting (lexer, token_type_to_name (type));
 645
 646       return false;
 647     }
 648 }
 649
 650 /* If the current token is a string, does nothing and returns true.
 651    Otherwise, reports an error and returns false. */
 652 bool
 653 lex_force_string (struct lexer *lexer)
 654 {
 655   if (lex_is_string (lexer))
 656     return true;
 657   else
 658     {
 659       lex_error (lexer, _("expecting string"));
 660       return false;
 661     }
 662 }
 663
 664 /* If the current token is a string or an identifier, does nothing and returns
 665    true.  Otherwise, reports an error and returns false.
 666
 667    This is meant for use in syntactic situations where we want to encourage the
 668    user to supply a quoted string, but for compatibility we also accept
 669    identifiers.  (One example of such a situation is file names.)  Therefore,
 670    the error message issued when the current token is wrong only says that a
 671    string is expected and doesn't mention that an identifier would also be
 672    accepted. */
 673 bool
 674 lex_force_string_or_id (struct lexer *lexer)
 675 {
 676   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 677 }
 678
 679 /* If the current token is an integer, does nothing and returns true.
 680    Otherwise, reports an error and returns false. */
 681 bool
 682 lex_force_int (struct lexer *lexer)
 683 {
 684   if (lex_is_integer (lexer))
 685     return true;
 686   else
 687     {
 688       lex_error (lexer, _("expecting integer"));
 689       return false;
 690     }
 691 }
 692
 693 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 694    nothing and returns true.  Otherwise, reports an error and returns false.
 695    If NAME is nonnull, then it is used in the error message. */
 696 bool
 697 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 698 {
 699   bool is_integer = lex_is_integer (lexer);
 700   bool too_small = is_integer && lex_integer (lexer) < min;
 701   bool too_big = is_integer && lex_integer (lexer) > max;
 702   if (is_integer && !too_small && !too_big)
 703     return true;
 704
 705   if (min > max)
 706     {
 707       /* Weird, maybe a bug in the caller.  Just report that we needed an
 708          integer. */
 709       if (name)
 710         lex_error (lexer, _("Integer expected for %s."), name);
 711       else
 712         lex_error (lexer, _("Integer expected."));
 713     }
 714   else if (min == max)
 715     {
 716       if (name)
 717         lex_error (lexer, _("Expected %ld for %s."), min, name);
 718       else
 719         lex_error (lexer, _("Expected %ld."), min);
 720     }
 721   else if (min + 1 == max)
 722     {
 723       if (name)
 724         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 725       else
 726         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 727     }
 728   else
 729     {
 730       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 731       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 732
 733       if (report_lower_bound && report_upper_bound)
 734         {
 735           if (name)
 736             lex_error (lexer,
 737                        _("Expected integer between %ld and %ld for %s."),
 738                        min, max, name);
 739           else
 740             lex_error (lexer, _("Expected integer between %ld and %ld."),
 741                        min, max);
 742         }
 743       else if (report_lower_bound)
 744         {
 745           if (min == 0)
 746             {
 747               if (name)
 748                 lex_error (lexer, _("Expected non-negative integer for %s."),
 749                            name);
 750               else
 751                 lex_error (lexer, _("Expected non-negative integer."));
 752             }
 753           else if (min == 1)
 754             {
 755               if (name)
 756                 lex_error (lexer, _("Expected positive integer for %s."),
 757                            name);
 758               else
 759                 lex_error (lexer, _("Expected positive integer."));
 760             }
 761         }
 762       else if (report_upper_bound)
 763         {
 764           if (name)
 765             lex_error (lexer,
 766                        _("Expected integer less than or equal to %ld for %s."),
 767                        max, name);
 768           else
 769             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 770                        max);
 771         }
 772       else
 773         {
 774           if (name)
 775             lex_error (lexer, _("Integer expected for %s."), name);
 776           else
 777             lex_error (lexer, _("Integer expected."));
 778         }
 779     }
 780   return false;
 781 }
 782
 783 /* If the current token is a number, does nothing and returns true.
 784    Otherwise, reports an error and returns false. */
 785 bool
 786 lex_force_num (struct lexer *lexer)
 787 {
 788   if (lex_is_number (lexer))
 789     return true;
 790
 791   lex_error (lexer, _("expecting number"));
 792   return false;
 793 }
 794
 795 /* If the current token is an identifier, does nothing and returns true.
 796    Otherwise, reports an error and returns false. */
 797 bool
 798 lex_force_id (struct lexer *lexer)
 799 {
 800   if (lex_token (lexer) == T_ID)
 801     return true;
 802
 803   lex_error (lexer, _("expecting identifier"));
 804   return false;
 805 }
 806 \f
 807 /* Token accessors. */
 808
 809 /* Returns the type of LEXER's current token. */
 810 enum token_type
 811 lex_token (const struct lexer *lexer)
 812 {
 813   return lex_next_token (lexer, 0);
 814 }
 815
 816 /* Returns the number in LEXER's current token.
 817
 818    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 819    tokens this function will always return zero. */
 820 double
 821 lex_tokval (const struct lexer *lexer)
 822 {
 823   return lex_next_tokval (lexer, 0);
 824 }
 825
 826 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 827
 828    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 829    this functions this function will always return NULL.
 830
 831    The UTF-8 encoding of the returned string is correct for variable names and
 832    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 833    data_in() to use it in a "union value".  */
 834 const char *
 835 lex_tokcstr (const struct lexer *lexer)
 836 {
 837   return lex_next_tokcstr (lexer, 0);
 838 }
 839
 840 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 841    null-terminated (but the null terminator is not included in the returned
 842    substring's 'length').
 843
 844    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 845    this functions this function will always return NULL.
 846
 847    The UTF-8 encoding of the returned string is correct for variable names and
 848    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 849    data_in() to use it in a "union value".  */
 850 struct substring
 851 lex_tokss (const struct lexer *lexer)
 852 {
 853   return lex_next_tokss (lexer, 0);
 854 }
 855 \f
 856 /* Looking ahead.
 857
 858    A value of 0 for N as an argument to any of these functions refers to the
 859    current token.  Lookahead is limited to the current command.  Any N greater
 860    than the number of tokens remaining in the current command will be treated
 861    as referring to a T_ENDCMD token. */
 862
 863 static const struct lex_token *
 864 lex_next__ (const struct lexer *lexer_, int n)
 865 {
 866   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 867   struct lex_source *src = lex_source__ (lexer);
 868
 869   if (src != NULL)
 870     return lex_source_next__ (src, n);
 871   else
 872     {
 873       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
 874       return &stop_token;
 875     }
 876 }
 877
 878 static const struct lex_token *
 879 lex_source_front (const struct lex_source *src)
 880 {
 881   return &src->tokens[deque_front (&src->deque, 0)];
 882 }
 883
 884 static const struct lex_token *
 885 lex_source_next__ (const struct lex_source *src, int n)
 886 {
 887   while (deque_count (&src->deque) <= n)
 888     {
 889       if (!deque_is_empty (&src->deque))
 890         {
 891           const struct lex_token *front = lex_source_front (src);
 892           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 893             return front;
 894         }
 895
 896       lex_source_get (src);
 897     }
 898
 899   return &src->tokens[deque_back (&src->deque, n)];
 900 }
 901
 902 /* Returns the "struct token" of the token N after the current one in LEXER.
 903    The returned pointer can be invalidated by pretty much any succeeding call
 904    into the lexer, although the string pointer within the returned token is
 905    only invalidated by consuming the token (e.g. with lex_get()). */
 906 const struct token *
 907 lex_next (const struct lexer *lexer, int n)
 908 {
 909   return &lex_next__ (lexer, n)->token;
 910 }
 911
 912 /* Returns the type of the token N after the current one in LEXER. */
 913 enum token_type
 914 lex_next_token (const struct lexer *lexer, int n)
 915 {
 916   return lex_next (lexer, n)->type;
 917 }
 918
 919 /* Returns the number in the tokn N after the current one in LEXER.
 920
 921    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 922    tokens this function will always return zero. */
 923 double
 924 lex_next_tokval (const struct lexer *lexer, int n)
 925 {
 926   return token_number (lex_next (lexer, n));
 927 }
 928
 929 /* Returns the null-terminated string in the token N after the current one, in
 930    UTF-8 encoding.
 931
 932    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 933    this functions this function will always return NULL.
 934
 935    The UTF-8 encoding of the returned string is correct for variable names and
 936    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 937    data_in() to use it in a "union value".  */
 938 const char *
 939 lex_next_tokcstr (const struct lexer *lexer, int n)
 940 {
 941   return lex_next_tokss (lexer, n).string;
 942 }
 943
 944 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 945    The string is null-terminated (but the null terminator is not included in
 946    the returned substring's 'length').
 947
 948    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
 949    tokens this functions this function will always return NULL.
 950
 951    The UTF-8 encoding of the returned string is correct for variable names and
 952    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 953    data_in() to use it in a "union value".  */
 954 struct substring
 955 lex_next_tokss (const struct lexer *lexer, int n)
 956 {
 957   return lex_next (lexer, n)->string;
 958 }
 959
 960 struct substring
 961 lex_next_representation (const struct lexer *lexer, int n0, int n1)
 962 {
 963   return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
 964 }
 965
 966 bool
 967 lex_next_is_from_macro (const struct lexer *lexer, int n)
 968 {
 969   return lex_next__ (lexer, n)->from_macro;
 970 }
 971
 972 static bool
 973 lex_tokens_match (const struct token *actual, const struct token *expected)
 974 {
 975   if (actual->type != expected->type)
 976     return false;
 977
 978   switch (actual->type)
 979     {
 980     case T_POS_NUM:
 981     case T_NEG_NUM:
 982       return actual->number == expected->number;
 983
 984     case T_ID:
 985       return lex_id_match (expected->string, actual->string);
 986
 987     case T_STRING:
 988       return (actual->string.length == expected->string.length
 989               && !memcmp (actual->string.string, expected->string.string,
 990                           actual->string.length));
 991
 992     default:
 993       return true;
 994     }
 995 }
 996
 997 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
 998    skips it and returns true.  Otherwise, returns false.
 999
1000    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1001    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1002    first three letters. */
1003 bool
1004 lex_match_phrase (struct lexer *lexer, const char *s)
1005 {
1006   struct string_lexer slex;
1007   struct token token;
1008   int i;
1009
1010   i = 0;
1011   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1012   while (string_lexer_next (&slex, &token))
1013     if (token.type != SCAN_SKIP)
1014       {
1015         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1016         token_uninit (&token);
1017         if (!match)
1018           return false;
1019       }
1020
1021   while (i-- > 0)
1022     lex_get (lexer);
1023   return true;
1024 }
1025
1026 static int
1027 lex_source_get_first_line_number (const struct lex_source *src, int n)
1028 {
1029   return lex_source_next__ (src, n)->first_line;
1030 }
1031
/* Returns the number of '\n' bytes among the first LENGTH bytes of S. */
static int
count_newlines (char *s, size_t length)
{
  int n_newlines = 0;

  for (size_t i = 0; i < length; i++)
    if (s[i] == '\n')
      n_newlines++;

  return n_newlines;
}
1047
1048 static int
1049 lex_source_get_last_line_number (const struct lex_source *src, int n)
1050 {
1051   const struct lex_token *token = lex_source_next__ (src, n);
1052
1053   if (token->first_line == 0)
1054     return 0;
1055   else
1056     {
1057       char *token_str = &src->buffer[token->token_pos - src->tail];
1058       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1059     }
1060 }
1061
1062 static int
1063 count_columns (const char *s_, size_t length)
1064 {
1065   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1066   int columns;
1067   size_t ofs;
1068   int mblen;
1069
1070   columns = 0;
1071   for (ofs = 0; ofs < length; ofs += mblen)
1072     {
1073       ucs4_t uc;
1074
1075       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1076       if (uc != '\t')
1077         {
1078           int width = uc_width (uc, "UTF-8");
1079           if (width > 0)
1080             columns += width;
1081         }
1082       else
1083         columns = ROUND_UP (columns + 1, 8);
1084     }
1085
1086   return columns + 1;
1087 }
1088
1089 static int
1090 lex_source_get_first_column (const struct lex_source *src, int n)
1091 {
1092   const struct lex_token *token = lex_source_next__ (src, n);
1093   return count_columns (&src->buffer[token->line_pos - src->tail],
1094                         token->token_pos - token->line_pos);
1095 }
1096
1097 static int
1098 lex_source_get_last_column (const struct lex_source *src, int n)
1099 {
1100   const struct lex_token *token = lex_source_next__ (src, n);
1101   char *start, *end, *newline;
1102
1103   start = &src->buffer[token->line_pos - src->tail];
1104   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1105   newline = memrchr (start, '\n', end - start);
1106   if (newline != NULL)
1107     start = newline + 1;
1108   return count_columns (start, end - start);
1109 }
1110
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token,
   if the token is drawn from a source that does not have line numbers, or if
   LEXER has no current source. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_first_line_number (src, n);
}
1120
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token, if the token is drawn from a source that does not have line numbers,
   or if LEXER has no current source.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_last_line_number (src, n);
}
1137
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_first_column (src, n);
}
1151
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_last_column (src, n);
}
1165
1166 /* Returns the name of the syntax file from which the current command is drawn.
1167    Returns NULL for a T_STOP token or if the command's source does not have
1168    line numbers.
1169
1170    There is no version of this function that takes an N argument because
1171    lookahead only works to the end of a command and any given command is always
1172    within a single syntax file. */
1173 const char *
1174 lex_get_file_name (const struct lexer *lexer)
1175 {
1176   struct lex_source *src = lex_source__ (lexer);
1177   return src == NULL ? NULL : src->reader->file_name;
1178 }
1179
1180 const char *
1181 lex_get_encoding (const struct lexer *lexer)
1182 {
1183   struct lex_source *src = lex_source__ (lexer);
1184   return src == NULL ? NULL : src->reader->encoding;
1185 }
1186
1187 /* Returns the syntax mode for the syntax file from which the current drawn is
1188    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1189    does not have line numbers.
1190
1191    There is no version of this function that takes an N argument because
1192    lookahead only works to the end of a command and any given command is always
1193    within a single syntax file. */
1194 enum segmenter_mode
1195 lex_get_syntax_mode (const struct lexer *lexer)
1196 {
1197   struct lex_source *src = lex_source__ (lexer);
1198   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1199 }
1200
1201 /* Returns the error mode for the syntax file from which the current drawn is
1202    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1203    source does not have line numbers.
1204
1205    There is no version of this function that takes an N argument because
1206    lookahead only works to the end of a command and any given command is always
1207    within a single syntax file. */
1208 enum lex_error_mode
1209 lex_get_error_mode (const struct lexer *lexer)
1210 {
1211   struct lex_source *src = lex_source__ (lexer);
1212   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1213 }
1214
1215 /* If the source that LEXER is currently reading has error mode
1216    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1217    token to be read comes directly from whatever is next read from the stream.
1218
1219    It makes sense to call this function after encountering an error in a
1220    command entered on the console, because usually the user would prefer not to
1221    have cascading errors. */
1222 void
1223 lex_interactive_reset (struct lexer *lexer)
1224 {
1225   struct lex_source *src = lex_source__ (lexer);
1226   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1227     {
1228       src->head = src->tail = 0;
1229       src->journal_pos = src->seg_pos = src->line_pos = 0;
1230       src->n_newlines = 0;
1231       src->suppress_next_newline = false;
1232       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1233                                        false);
1234       while (!deque_is_empty (&src->deque))
1235         lex_source_pop__ (src);
1236       lex_source_push_endcmd__ (src);
1237     }
1238 }
1239
1240 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1241 void
1242 lex_discard_rest_of_command (struct lexer *lexer)
1243 {
1244   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1245     lex_get (lexer);
1246 }
1247
1248 /* Discards all lookahead tokens in LEXER, then discards all input sources
1249    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1250    runs out of input sources. */
1251 void
1252 lex_discard_noninteractive (struct lexer *lexer)
1253 {
1254   struct lex_source *src = lex_source__ (lexer);
1255
1256   if (src != NULL)
1257     {
1258       while (!deque_is_empty (&src->deque))
1259         lex_source_pop__ (src);
1260
1261       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1262            src = lex_source__ (lexer))
1263         lex_source_destroy (src);
1264     }
1265 }
1266 \f
1267 static size_t
1268 lex_source_max_tail__ (const struct lex_source *src)
1269 {
1270   const struct lex_token *token;
1271   size_t max_tail;
1272
1273   assert (src->seg_pos >= src->line_pos);
1274   max_tail = MIN (src->journal_pos, src->line_pos);
1275
1276   /* Use the oldest token also.  (We know that src->deque cannot be empty
1277      because we are in the process of adding a new token, which is already
1278      initialized enough to use here.) */
1279   token = &src->tokens[deque_back (&src->deque, 0)];
1280   assert (token->token_pos >= token->line_pos);
1281   max_tail = MIN (max_tail, token->line_pos);
1282
1283   return max_tail;
1284 }
1285
1286 static void
1287 lex_source_expand__ (struct lex_source *src)
1288 {
1289   if (src->head - src->tail >= src->allocated)
1290     {
1291       size_t max_tail = lex_source_max_tail__ (src);
1292       if (max_tail > src->tail)
1293         {
1294           /* Advance the tail, freeing up room at the head. */
1295           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1296                    src->head - max_tail);
1297           src->tail = max_tail;
1298         }
1299       else
1300         {
1301           /* Buffer is completely full.  Expand it. */
1302           src->buffer = x2realloc (src->buffer, &src->allocated);
1303         }
1304     }
1305   else
1306     {
1307       /* There's space available at the head of the buffer.  Nothing to do. */
1308     }
1309 }
1310
1311 static void
1312 lex_source_read__ (struct lex_source *src)
1313 {
1314   do
1315     {
1316       lex_source_expand__ (src);
1317
1318       size_t head_ofs = src->head - src->tail;
1319       size_t space = src->allocated - head_ofs;
1320       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1321       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1322                                            space, prompt);
1323       assert (n <= space);
1324
1325       if (n == 0)
1326         {
1327           /* End of input. */
1328           src->reader->eof = true;
1329           lex_source_expand__ (src);
1330           return;
1331         }
1332
1333       src->head += n;
1334     }
1335   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1336                   src->head - src->seg_pos));
1337 }
1338
1339 static struct lex_source *
1340 lex_source__ (const struct lexer *lexer)
1341 {
1342   return (ll_is_empty (&lexer->sources) ? NULL
1343           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1344 }
1345
1346 static struct substring
1347 lex_tokens_get_syntax__ (const struct lex_source *src,
1348                          const struct lex_token *token0,
1349                          const struct lex_token *token1)
1350 {
1351   size_t start = token0->token_pos;
1352   size_t end = token1->token_pos + token1->token_len;
1353
1354   return ss_buffer (&src->buffer[start - src->tail], end - start);
1355 }
1356
1357 static struct substring
1358 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1359 {
1360   return lex_tokens_get_syntax__ (src,
1361                                   lex_source_next__ (src, n0),
1362                                   lex_source_next__ (src, MAX (n0, n1)));
1363 }
1364
1365 static void
1366 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1367 {
1368   size_t out_maxlen;
1369   size_t out_len;
1370   int mblen;
1371
1372   assert (out_size >= 16);
1373   out_maxlen = out_size - 1;
1374   if (in.length > out_maxlen - 3)
1375     out_maxlen -= 3;
1376
1377   for (out_len = 0; out_len < in.length; out_len += mblen)
1378     {
1379       if (in.string[out_len] == '\n'
1380           || in.string[out_len] == '\0'
1381           || (in.string[out_len] == '\r'
1382               && out_len + 1 < in.length
1383               && in.string[out_len + 1] == '\n'))
1384         break;
1385
1386       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1387                         in.length - out_len);
1388
1389       if (mblen < 0)
1390         break;
1391
1392       if (out_len + mblen > out_maxlen)
1393         break;
1394     }
1395
1396   memcpy (out, in.string, out_len);
1397   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1398 }
1399
1400 static void
1401 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1402                          const char *format, va_list args)
1403 {
1404   const struct lex_token *token;
1405   struct string s;
1406
1407   ds_init_empty (&s);
1408
1409   token = lex_source_next__ (src, n0);
1410   if (token->token.type == T_ENDCMD)
1411     ds_put_cstr (&s, _("Syntax error at end of command"));
1412   else if (token->from_macro)
1413     {
1414       /* XXX this isn't ideal, we should get the actual syntax */
1415       char *syntax = token_to_string (&token->token);
1416       if (syntax)
1417         ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1418       else
1419         ds_put_cstr (&s, _("Syntax error"));
1420       free (syntax);
1421     }
1422   else
1423     {
1424       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1425       if (!ss_is_empty (syntax))
1426         {
1427           char syntax_cstr[64];
1428
1429           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1430           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1431         }
1432       else
1433         ds_put_cstr (&s, _("Syntax error"));
1434     }
1435
1436   if (format)
1437     {
1438       ds_put_cstr (&s, ": ");
1439       ds_put_vformat (&s, format, args);
1440     }
1441   if (ds_last (&s) != '.')
1442     ds_put_byte (&s, '.');
1443
1444   struct msg m = {
1445     .category = MSG_C_SYNTAX,
1446     .severity = MSG_S_ERROR,
1447     .file_name = src->reader->file_name,
1448     .first_line = lex_source_get_first_line_number (src, n0),
1449     .last_line = lex_source_get_last_line_number (src, n1),
1450     .first_column = lex_source_get_first_column (src, n0),
1451     .last_column = lex_source_get_last_column (src, n1),
1452     .text = ds_steal_cstr (&s),
1453   };
1454   msg_emit (&m);
1455 }
1456
1457 static void PRINTF_FORMAT (2, 3)
1458 lex_get_error (struct lex_source *src, const char *format, ...)
1459 {
1460   va_list args;
1461   int n;
1462
1463   va_start (args, format);
1464
1465   n = deque_count (&src->deque) - 1;
1466   lex_source_error_valist (src, n, n, format, args);
1467   lex_source_pop_front (src);
1468
1469   va_end (args);
1470 }
1471
/* Attempts to append an additional token into SRC's deque, reading more from
   the underlying lex_reader if necessary.  Returns true if a new token was
   added to SRC's deque, false otherwise.

   A token is always pushed provisionally.  It is removed again, and false is
   returned, if the scanner yields SCAN_SKIP or one of the scanner's error
   types; in the error case a syntax error is reported first.  A T_STOP from
   the scanner is converted into T_ENDCMD and marks SRC as being at end of
   input.

   As a side effect, each line of syntax that is completed along the way is
   submitted to the output engine as a syntax text item. */
static bool
lex_source_try_get (struct lex_source *src)
{
  /* State maintained while scanning tokens.  Usually we only need a single
     state, but scanner_push() can return SCAN_SAVE to indicate that the state
     needs to be saved and possibly restored later with SCAN_BACK. */
  struct state
    {
      struct segmenter segmenter;
      enum segment_type last_segment;
      int newlines;             /* Number of newlines encountered so far. */
      /* Maintained here so we can update lex_source's similar members when we
         finish. */
      size_t line_pos;
      size_t seg_pos;
    };

  /* Initialize state.  (last_segment is not named in the initializer, so it
     starts out as zero.) */
  struct state state =
    {
      .segmenter = src->segmenter,
      .newlines = 0,
      .seg_pos = src->seg_pos,
      .line_pos = src->line_pos,
    };
  struct state saved = state;

  /* Append a new token to SRC and initialize it. */
  struct lex_token *token = lex_push_token__ (src);
  struct scanner scanner;
  scanner_init (&scanner, &token->token);
  token->line_pos = src->line_pos;
  token->token_pos = src->seg_pos;
  /* A first_line of 0 means that no line number is available. */
  if (src->reader->line_number > 0)
    token->first_line = src->reader->line_number + src->n_newlines;
  else
    token->first_line = 0;

  /* Extract segments and pass them through the scanner until we obtain a
     token. */
  for (;;)
    {
      /* Extract a segment. */
      const char *segment = &src->buffer[state.seg_pos - src->tail];
      size_t seg_maxlen = src->head - state.seg_pos;
      enum segment_type type;
      int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
                                    src->reader->eof, &type);
      if (seg_len < 0)
        {
          /* The segmenter needs more input to produce a segment.  Reading can
             move or reallocate SRC's buffer, which is why SEGMENT is computed
             afresh at the top of each iteration. */
          assert (!src->reader->eof);
          lex_source_read__ (src);
          continue;
        }

      /* Update state based on the segment. */
      state.last_segment = type;
      state.seg_pos += seg_len;
      if (type == SEG_NEWLINE)
        {
          state.newlines++;
          state.line_pos = state.seg_pos;
        }

      /* Pass the segment into the scanner and try to get a token out.  Any
         result other than the three handled here just continues with the
         next segment. */
      enum scan_result result = scanner_push (&scanner, type,
                                              ss_buffer (segment, seg_len),
                                              &token->token);
      if (result == SCAN_SAVE)
        saved = state;
      else if (result == SCAN_BACK)
        {
          /* Rewind to the most recently saved state (or to the initial state
             if none was saved), so that the segments consumed since then
             will be segmented again for the next token. */
          state = saved;
          break;
        }
      else if (result == SCAN_DONE)
        break;
    }

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.

     A line that ends a command is submitted as soon as the end of the command
     is seen, and suppress_next_newline is then set so that the same line is
     not submitted a second time when its new-line is reached. */
  int n_lines = state.newlines;
  if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos - src->tail];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->head even though that may be beyond what we've actually
         converted to tokens (which is only through state.line_pos).  That's
         because, if we're emitting the line due to SEG_END_COMMAND, we want to
         take the whole line through the newline, not just through the '.'. */
      size_t max_len = src->head - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax. */
      output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                   xmemdup0 (line, copy_len),
                                                   NULL));

      src->journal_pos += line_len;
    }

  /* Record how much input the token covers, then commit the scanning state
     back into SRC. */
  token->token_len = state.seg_pos - src->seg_pos;

  src->segmenter = state.segmenter;
  src->seg_pos = state.seg_pos;
  src->line_pos = state.line_pos;
  src->n_newlines += state.newlines;

  /* Decide what to do with the token based on the type that the scanner
     produced.  Ordinary tokens are kept as they are.  For the error types,
     lex_get_error() reports the problem and removes the token again. */
  switch (token->token.type)
    {
    default:
      return true;

    case T_STOP:
      /* End of input.  Callers get a T_ENDCMD instead, and SRC is marked so
         that no further tokens will be read from it. */
      token->token.type = T_ENDCMD;
      src->eof = true;
      return true;

    case SCAN_BAD_HEX_LENGTH:
      lex_get_error (src, _("String of hex digits has %d characters, which "
                            "is not a multiple of 2"),
                     (int) token->token.number);
      return false;

    case SCAN_BAD_HEX_DIGIT:
    case SCAN_BAD_UNICODE_DIGIT:
      lex_get_error (src, _("`%c' is not a valid hex digit"),
                     (int) token->token.number);
      return false;

    case SCAN_BAD_UNICODE_LENGTH:
      lex_get_error (src, _("Unicode string contains %d bytes, which is "
                            "not in the valid range of 1 to 8 bytes"),
                     (int) token->token.number);
      return false;

    case SCAN_BAD_UNICODE_CODE_POINT:
      lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
                     (int) token->token.number);
      return false;

    case SCAN_EXPECTED_QUOTE:
      lex_get_error (src, _("Unterminated string constant"));
      return false;

    case SCAN_EXPECTED_EXPONENT:
      lex_get_error (src, _("Missing exponent following `%s'"),
                     token->token.string.string);
      return false;

    case SCAN_UNEXPECTED_CHAR:
      {
        char c_name[16];
        lex_get_error (src, _("Bad character %s in input"),
                       uc_name (token->token.number, c_name));
        return false;
      }

    case SCAN_SKIP:
      /* Nothing to report; just discard the provisional token. */
      lex_source_pop_front (src);
      return false;
    }

  NOT_REACHED ();
}
1662
1663 static bool
1664 lex_source_get__ (struct lex_source *src)
1665 {
1666   for (;;)
1667     {
1668       if (src->eof)
1669         return false;
1670       else if (lex_source_try_get (src))
1671         return true;
1672     }
1673 }
1674
1675 static bool
1676 lex_source_get (const struct lex_source *src_)
1677 {
1678   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1679
1680   size_t old_count = deque_count (&src->deque);
1681   if (!lex_source_get__ (src))
1682     return false;
1683
1684   if (!settings_get_mexpand ())
1685     return true;
1686
1687   struct macro_expander *me;
1688   int retval = macro_expander_create (src->lexer->macros,
1689                                       &lex_source_front (src)->token,
1690                                       &me);
1691   while (!retval)
1692     {
1693       if (!lex_source_get__ (src))
1694         {
1695           /* This should not be reachable because we always get a T_ENDCMD at
1696              the end of an input file (transformed from T_STOP by
1697              lex_source_try_get()) and the macro_expander should always
1698              terminate expansion on T_ENDCMD. */
1699           NOT_REACHED ();
1700         }
1701
1702       const struct lex_token *front = lex_source_front (src);
1703       const struct macro_token mt = {
1704         .token = front->token,
1705         .representation = lex_tokens_get_syntax__ (src, front, front)
1706       };
1707       retval = macro_expander_add (me, &mt);
1708     }
1709   if (retval < 0)
1710     {
1711       /* XXX handle case where there's a macro invocation starting from some
1712          later token we've already obtained */
1713       macro_expander_destroy (me);
1714       return true;
1715     }
1716
1717   /* XXX handle case where the macro invocation doesn't use all the tokens */
1718   while (deque_count (&src->deque) > old_count)
1719     lex_source_pop_front (src);
1720
1721   struct macro_tokens expansion = { .n = 0 };
1722   macro_expander_get_expansion (me, &expansion);
1723   macro_expander_destroy (me);
1724
1725   if (settings_get_mprint ())
1726     {
1727       struct string mprint = DS_EMPTY_INITIALIZER;
1728       macro_tokens_to_representation (&expansion, &mprint);
1729       output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&mprint),
1730                                             _("Macro Expansion")));
1731       ds_destroy (&mprint);
1732     }
1733
1734   for (size_t i = 0; i < expansion.n; i++)
1735     {
1736       *lex_push_token__ (src) = (struct lex_token) {
1737         .token = expansion.mts[i].token,
1738         .from_macro = true,
1739         /* XXX the rest */
1740       };
1741
1742       ss_dealloc (&expansion.mts[i].representation); /* XXX should feed into lexer */
1743     }
1744   free (expansion.mts);
1745
1746   return true;
1747 }
1748 \f
1749 static void
1750 lex_source_push_endcmd__ (struct lex_source *src)
1751 {
1752   *lex_push_token__ (src) = (struct lex_token) { .token = { .type = T_ENDCMD } };
1753 }
1754
1755 static struct lex_source *
1756 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1757 {
1758   struct lex_source *src;
1759
1760   src = xzalloc (sizeof *src);
1761   src->reader = reader;
1762   src->segmenter = segmenter_init (reader->syntax, false);
1763   src->lexer = lexer;
1764   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1765
1766   lex_source_push_endcmd__ (src);
1767
1768   return src;
1769 }
1770
1771 static void
1772 lex_source_destroy (struct lex_source *src)
1773 {
1774   char *file_name = src->reader->file_name;
1775   char *encoding = src->reader->encoding;
1776   if (src->reader->class->destroy != NULL)
1777     src->reader->class->destroy (src->reader);
1778   free (file_name);
1779   free (encoding);
1780   free (src->buffer);
1781   while (!deque_is_empty (&src->deque))
1782     lex_source_pop__ (src);
1783   free (src->tokens);
1784   ll_remove (&src->ll);
1785   free (src);
1786 }
1787 \f
/* A lex_reader that obtains its input from a file, or from stdin, by way of a
   u8_istream. */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Base; see lex_file_reader_cast(). */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
  };

/* Tentative definition.  The initializer appears further down, after the
   functions that it refers to. */
static struct lex_reader_class lex_file_reader_class;
1795
1796 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1797    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1798    ENCODING, which should take one of the forms accepted by
1799    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1800    mode of the new reader, respectively.
1801
1802    Returns a null pointer if FILE_NAME cannot be opened. */
1803 struct lex_reader *
1804 lex_reader_for_file (const char *file_name, const char *encoding,
1805                      enum segmenter_mode syntax,
1806                      enum lex_error_mode error)
1807 {
1808   struct lex_file_reader *r;
1809   struct u8_istream *istream;
1810
1811   istream = (!strcmp(file_name, "-")
1812              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1813              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1814   if (istream == NULL)
1815     {
1816       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1817       return NULL;
1818     }
1819
1820   r = xmalloc (sizeof *r);
1821   lex_reader_init (&r->reader, &lex_file_reader_class);
1822   r->reader.syntax = syntax;
1823   r->reader.error = error;
1824   r->reader.file_name = xstrdup (file_name);
1825   r->reader.encoding = xstrdup_if_nonnull (encoding);
1826   r->reader.line_number = 1;
1827   r->istream = istream;
1828
1829   return &r->reader;
1830 }
1831
1832 static struct lex_file_reader *
1833 lex_file_reader_cast (struct lex_reader *r)
1834 {
1835   return UP_CAST (r, struct lex_file_reader, reader);
1836 }
1837
1838 static size_t
1839 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1840                enum prompt_style prompt_style UNUSED)
1841 {
1842   struct lex_file_reader *r = lex_file_reader_cast (r_);
1843   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1844   if (n_read < 0)
1845     {
1846       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1847       return 0;
1848     }
1849   return n_read;
1850 }
1851
1852 static void
1853 lex_file_close (struct lex_reader *r_)
1854 {
1855   struct lex_file_reader *r = lex_file_reader_cast (r_);
1856
1857   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1858     {
1859       if (u8_istream_close (r->istream) != 0)
1860         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1861     }
1862   else
1863     u8_istream_free (r->istream);
1864
1865   free (r);
1866 }
1867
/* Operations for lex_readers that read from a file or standard input.  The
   initializers are positional, so their order must match the member order of
   struct lex_reader_class. */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,              /* Reads bytes from the underlying stream. */
    lex_file_close              /* Releases the stream and the reader. */
  };
1873 \f
/* A lex_reader whose input is an in-memory string. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded generic reader. */
    struct substring s;         /* Input text; ss_dealloc()'d on close. */
    size_t offset;              /* Number of bytes of S already read. */
  };
1880
/* Forward declaration, so that the constructor below can refer to the class
   before its definition later in this file. */
static struct lex_reader_class lex_string_reader_class;
1882
1883 /* Creates and returns a new lex_reader for the contents of S, which must be
1884    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
1885    with ss_dealloc() when it is closed. */
1886 struct lex_reader *
1887 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1888 {
1889   struct lex_string_reader *r;
1890
1891   r = xmalloc (sizeof *r);
1892   lex_reader_init (&r->reader, &lex_string_reader_class);
1893   r->reader.syntax = SEG_MODE_AUTO;
1894   r->reader.encoding = xstrdup_if_nonnull (encoding);
1895   r->s = s;
1896   r->offset = 0;
1897
1898   return &r->reader;
1899 }
1900
1901 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1902    which must be encoded in ENCODING.  The caller retains ownership of S. */
1903 struct lex_reader *
1904 lex_reader_for_string (const char *s, const char *encoding)
1905 {
1906   struct substring ss;
1907   ss_alloc_substring (&ss, ss_cstr (s));
1908   return lex_reader_for_substring_nocopy (ss, encoding);
1909 }
1910
/* Expands FORMAT, a printf()-like format string, and creates and returns a
   new lex_reader for the resulting text, which must be encoded in ENCODING.

   Note that the arguments consumed by FORMAT follow ENCODING in the argument
   list, not FORMAT itself. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  char *text;

  va_start (args, encoding);
  text = xvasprintf (format, args);
  va_end (args);

  /* The reader takes ownership of TEXT, so it is not freed here. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1925
1926 static struct lex_string_reader *
1927 lex_string_reader_cast (struct lex_reader *r)
1928 {
1929   return UP_CAST (r, struct lex_string_reader, reader);
1930 }
1931
1932 static size_t
1933 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1934                  enum prompt_style prompt_style UNUSED)
1935 {
1936   struct lex_string_reader *r = lex_string_reader_cast (r_);
1937   size_t chunk;
1938
1939   chunk = MIN (n, r->s.length - r->offset);
1940   memcpy (buf, r->s.string + r->offset, chunk);
1941   r->offset += chunk;
1942
1943   return chunk;
1944 }
1945
1946 static void
1947 lex_string_close (struct lex_reader *r_)
1948 {
1949   struct lex_string_reader *r = lex_string_reader_cast (r_);
1950
1951   ss_dealloc (&r->s);
1952   free (r);
1953 }
1954
/* Operations for lex_readers that read from an in-memory string.  The
   initializers are positional, so their order must match the member order of
   struct lex_reader_class. */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,            /* Copies out the next bytes of the string. */
    lex_string_close            /* Frees the string and the reader. */
  };