pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
  58 /* A token within a lex_source: the token proper plus a record of where in
        the source's UTF-8 buffer it was found. */
  59 struct lex_token
  60   {
  61     /* The regular token information. */
  62     struct token token;
  63
  64     /* Location of token in terms of the lex_source's buffer.
  65        src->tail <= line_pos <= token_pos <= src->head. */
  66     size_t token_pos;           /* Start of token. */
  67     size_t token_len;           /* Length of source for token in bytes. */
  68     size_t line_pos;            /* Start of line containing token_pos. */
  69     int first_line;             /* Line number at token_pos. */
  70   };
  71
  72 /* A source of tokens, corresponding to a syntax file.
  73
  74    This is conceptually a lex_reader wrapped with everything needed to convert
  75    its UTF-8 bytes into tokens. */
  76 struct lex_source
  77   {
  78     struct ll ll;               /* In lexer's list of sources. */
  79     struct lex_reader *reader;  /* The wrapped reader that supplies bytes. */
  80     struct segmenter segmenter;
  81     bool eof;                   /* True if T_STOP was read from 'reader'. */
  82
  83     /* Buffer of UTF-8 bytes. */
  84     char *buffer;
  85     size_t allocated;           /* Number of bytes allocated. */
  86     size_t tail;                /* Offset into UTF-8 source of &buffer[0]. */
  87     size_t head;                /* Offset into source of &buffer[head - tail]. */
  88
  89     /* Positions in source file, tail <= pos <= head for each member here. */
  90     size_t journal_pos;         /* First byte not yet output to journal. */
  91     size_t seg_pos;             /* First byte not yet scanned as token. */
  92     size_t line_pos;            /* First byte of line containing seg_pos. */
  93
  94     int n_newlines;             /* Number of new-lines up to seg_pos. */
  95     bool suppress_next_newline;
  96
  97     /* Tokens.  New tokens are pushed at the front of 'deque'; the current
            token (lookahead 0) is the one at its back. */
  98     struct deque deque;         /* Indexes into 'tokens'. */
  99     struct lex_token *tokens;   /* Lookahead tokens for parser. */
 100   };
 101
 102 static struct lex_source *lex_source_create (struct lex_reader *);
 103 static void lex_source_destroy (struct lex_source *);
 104
 105 /* Lexer: an ordered list of token sources.  lex_include() adds a source at
        the head of the list and lex_append() adds one at the tail. */
 106 struct lexer
 107   {
 108     struct ll_list sources;     /* Contains "struct lex_source"s. */
 109   };
 110
 111 static struct lex_source *lex_source__ (const struct lexer *);
 112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 113 static void lex_source_push_endcmd__ (struct lex_source *);
 114
 115 static void lex_source_pop__ (struct lex_source *);
 116 static bool lex_source_get__ (const struct lex_source *);
 117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 118                                      const char *format, va_list)
 119    PRINTF_FORMAT (4, 0);
 120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 121                                                   int n);
 122 \f
 123 /* Initializes READER with the specified CLASS and otherwise some reasonable
 124    defaults.  The caller should fill in the others members as desired. */
 125 void
 126 lex_reader_init (struct lex_reader *reader,
 127                  const struct lex_reader_class *class)
 128 {
 129   reader->class = class;
 130   reader->syntax = LEX_SYNTAX_AUTO;
 131   reader->error = LEX_ERROR_CONTINUE;
 132   reader->file_name = NULL;
 133   reader->encoding = NULL;
 134   reader->line_number = 0;
 135   reader->eof = false;
 136 }
 137
 138 /* Frees any file name already in READER and replaces it by a copy of
 139    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 140 void
 141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 142 {
 143   free (reader->file_name);
 144   reader->file_name = xstrdup_if_nonnull (file_name);
 145 }
 146 \f
 147 /* Creates and returns a new lexer. */
 148 struct lexer *
 149 lex_create (void)
 150 {
 151   struct lexer *lexer = xzalloc (sizeof *lexer);
 152   ll_init (&lexer->sources);
 153   return lexer;
 154 }
 155
 156 /* Destroys LEXER. */
 157 void
 158 lex_destroy (struct lexer *lexer)
 159 {
 160   if (lexer != NULL)
 161     {
 162       struct lex_source *source, *next;
 163
 164       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 165         lex_source_destroy (source);
 166       free (lexer);
 167     }
 168 }
 169
 170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 171    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 172    token. */
 173 void
 174 lex_include (struct lexer *lexer, struct lex_reader *reader)
 175 {
 176   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 177   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 178 }
 179
 180 /* Appends READER to LEXER, so that it will be read after all other current
 181    readers have already been read. */
 182 void
 183 lex_append (struct lexer *lexer, struct lex_reader *reader)
 184 {
 185   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 186 }
 187 \f
 188 /* Advancing. */
 189
 190 static struct lex_token *
 191 lex_push_token__ (struct lex_source *src)
 192 {
 193   struct lex_token *token;
 194
 195   if (deque_is_full (&src->deque))
 196     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 197
 198   token = &src->tokens[deque_push_front (&src->deque)];
 199   token_init (&token->token);
 200   return token;
 201 }
 202
 203 static void
 204 lex_source_pop__ (struct lex_source *src)
 205 {
 206   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 207 }
 208
 209 static void
 210 lex_source_pop_front (struct lex_source *src)
 211 {
 212   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 213 }
 214
 215 /* Advances LEXER to the next token, consuming the current token. */
 216 void
 217 lex_get (struct lexer *lexer)
 218 {
 219   struct lex_source *src;
 220
 221   src = lex_source__ (lexer);
 222   if (src == NULL)
 223     return;
 224
 225   if (!deque_is_empty (&src->deque))
 226     lex_source_pop__ (src);
 227
 228   while (deque_is_empty (&src->deque))
 229     if (!lex_source_get__ (src))
 230       {
 231         lex_source_destroy (src);
 232         src = lex_source__ (lexer);
 233         if (src == NULL)
 234           return;
 235       }
 236 }
 237 \f
 238 /* Issuing errors. */
 239
 240 /* Prints a syntax error message containing the current token and
 241    given message MESSAGE (if non-null). */
 242 void
 243 lex_error (struct lexer *lexer, const char *format, ...)
 244 {
 245   va_list args;
 246
 247   va_start (args, format);
 248   lex_next_error_valist (lexer, 0, 0, format, args);
 249   va_end (args);
 250 }
 251
 252 /* Prints a syntax error message containing the current token and
 253    given message MESSAGE (if non-null). */
 254 void
 255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 256 {
 257   lex_next_error_valist (lexer, 0, 0, format, args);
 258 }
 259
 260 /* Prints a syntax error message containing the current token and
 261    given message MESSAGE (if non-null). */
 262 void
 263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 264 {
 265   va_list args;
 266
 267   va_start (args, format);
 268   lex_next_error_valist (lexer, n0, n1, format, args);
 269   va_end (args);
 270 }
 271
 272 /* Prints a syntax error message saying that one of the strings provided as
 273    varargs, up to the first NULL, is expected. */
 274 void
 275 (lex_error_expecting) (struct lexer *lexer, ...)
 276 {
 277   va_list args;
 278
 279   va_start (args, lexer);
 280   lex_error_expecting_valist (lexer, args);
 281   va_end (args);
 282 }
 283
 284 /* Prints a syntax error message saying that one of the options provided in
 285    ARGS, up to the first NULL, is expected. */
 286 void
 287 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 288 {
 289   enum { MAX_OPTIONS = 9 };
 290   const char *options[MAX_OPTIONS];
 291   int n = 0;
 292   while (n < MAX_OPTIONS)
 293     {
 294       const char *option = va_arg (args, const char *);
 295       if (!option)
 296         break;
 297
 298       options[n++] = option;
 299     }
 300   lex_error_expecting_array (lexer, options, n);
 301 }
 302
 303 void
 304 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 305 {
 306   switch (n)
 307     {
 308     case 0:
 309       lex_error (lexer, NULL);
 310       break;
 311
 312     case 1:
 313       lex_error (lexer, _("expecting %s"), options[0]);
 314       break;
 315
 316     case 2:
 317       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 318       break;
 319
 320     case 3:
 321       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 322                  options[2]);
 323       break;
 324
 325     case 4:
 326       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 327                  options[0], options[1], options[2], options[3]);
 328       break;
 329
 330     case 5:
 331       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 332                  options[0], options[1], options[2], options[3], options[4]);
 333       break;
 334
 335     case 6:
 336       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 337                  options[0], options[1], options[2], options[3], options[4],
 338                  options[5]);
 339       break;
 340
 341     case 7:
 342       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 343                  options[0], options[1], options[2], options[3], options[4],
 344                  options[5], options[6]);
 345       break;
 346
 347     case 8:
 348       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 349                  options[0], options[1], options[2], options[3], options[4],
 350                  options[5], options[6], options[7]);
 351       break;
 352
 353     default:
 354       lex_error (lexer, NULL);
 355     }
 356 }
 357
 358 /* Reports an error to the effect that subcommand SBC may only be specified
 359    once.
 360
 361    This function does not take a lexer as an argument or use lex_error(),
 362    because the result would ordinarily just be redundant: "Syntax error at
 363    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 364    not help the user find the error. */
 365 void
 366 lex_sbc_only_once (const char *sbc)
 367 {
 368   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 369 }
 370
 371 /* Reports an error to the effect that subcommand SBC is missing.
 372
 373    This function does not take a lexer as an argument or use lex_error(),
 374    because a missing subcommand can normally be detected only after the whole
 375    command has been parsed, and so lex_error() would always report "Syntax
 376    error at end of command", which does not help the user find the error. */
 377 void
 378 lex_sbc_missing (const char *sbc)
 379 {
 380   msg (SE, _("Required subcommand %s was not specified."), sbc);
 381 }
 382
 383 /* Reports an error to the effect that specification SPEC may only be specified
 384    once within subcommand SBC. */
 385 void
 386 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 387 {
 388   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 389              spec, sbc);
 390 }
 391
 392 /* Reports an error to the effect that specification SPEC is missing within
 393    subcommand SBC. */
 394 void
 395 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 396 {
 397   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 398              sbc, spec);
 399 }
 400
 401 /* Prints a syntax error message containing the current token and
 402    given message MESSAGE (if non-null). */
 403 void
 404 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 405                        const char *format, va_list args)
 406 {
 407   struct lex_source *src = lex_source__ (lexer);
 408
 409   if (src != NULL)
 410     lex_source_error_valist (src, n0, n1, format, args);
 411   else
 412     {
 413       struct string s;
 414
 415       ds_init_empty (&s);
 416       ds_put_format (&s, _("Syntax error at end of input"));
 417       if (format != NULL)
 418         {
 419           ds_put_cstr (&s, ": ");
 420           ds_put_vformat (&s, format, args);
 421         }
 422       ds_put_byte (&s, '.');
 423       msg (SE, "%s", ds_cstr (&s));
 424       ds_destroy (&s);
 425     }
 426 }
 427
 428 /* Checks that we're at end of command.
 429    If so, returns a successful command completion code.
 430    If not, flags a syntax error and returns an error command
 431    completion code. */
 432 int
 433 lex_end_of_command (struct lexer *lexer)
 434 {
 435   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 436     {
 437       lex_error (lexer, _("expecting end of command"));
 438       return CMD_FAILURE;
 439     }
 440   else
 441     return CMD_SUCCESS;
 442 }
 443 \f
 444 /* Token testing functions. */
 445
 446 /* Returns true if the current token is a number. */
 447 bool
 448 lex_is_number (const struct lexer *lexer)
 449 {
 450   return lex_next_is_number (lexer, 0);
 451 }
 452
 453 /* Returns true if the current token is a string. */
 454 bool
 455 lex_is_string (const struct lexer *lexer)
 456 {
 457   return lex_next_is_string (lexer, 0);
 458 }
 459
 460 /* Returns the value of the current token, which must be a
 461    floating point number. */
 462 double
 463 lex_number (const struct lexer *lexer)
 464 {
 465   return lex_next_number (lexer, 0);
 466 }
 467
 468 /* Returns true iff the current token is an integer. */
 469 bool
 470 lex_is_integer (const struct lexer *lexer)
 471 {
 472   return lex_next_is_integer (lexer, 0);
 473 }
 474
 475 /* Returns the value of the current token, which must be an
 476    integer. */
 477 long
 478 lex_integer (const struct lexer *lexer)
 479 {
 480   return lex_next_integer (lexer, 0);
 481 }
 482 \f
 483 /* Token testing functions with lookahead.
 484
 485    A value of 0 for N as an argument to any of these functions refers to the
 486    current token.  Lookahead is limited to the current command.  Any N greater
 487    than the number of tokens remaining in the current command will be treated
 488    as referring to a T_ENDCMD token. */
 489
 490 /* Returns true if the token N ahead of the current token is a number. */
 491 bool
 492 lex_next_is_number (const struct lexer *lexer, int n)
 493 {
 494   enum token_type next_token = lex_next_token (lexer, n);
 495   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 496 }
 497
 498 /* Returns true if the token N ahead of the current token is a string. */
 499 bool
 500 lex_next_is_string (const struct lexer *lexer, int n)
 501 {
 502   return lex_next_token (lexer, n) == T_STRING;
 503 }
 504
 505 /* Returns the value of the token N ahead of the current token, which must be a
 506    floating point number. */
 507 double
 508 lex_next_number (const struct lexer *lexer, int n)
 509 {
 510   assert (lex_next_is_number (lexer, n));
 511   return lex_next_tokval (lexer, n);
 512 }
 513
 514 /* Returns true if the token N ahead of the current token is an integer. */
 515 bool
 516 lex_next_is_integer (const struct lexer *lexer, int n)
 517 {
 518   double value;
 519
 520   if (!lex_next_is_number (lexer, n))
 521     return false;
 522
 523   value = lex_next_tokval (lexer, n);
 524   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 525 }
 526
 527 /* Returns the value of the token N ahead of the current token, which must be
 528    an integer. */
 529 long
 530 lex_next_integer (const struct lexer *lexer, int n)
 531 {
 532   assert (lex_next_is_integer (lexer, n));
 533   return lex_next_tokval (lexer, n);
 534 }
 535 \f
 536 /* Token matching functions. */
 537
 538 /* If the current token has the specified TYPE, skips it and returns true.
 539    Otherwise, returns false. */
 540 bool
 541 lex_match (struct lexer *lexer, enum token_type type)
 542 {
 543   if (lex_token (lexer) == type)
 544     {
 545       lex_get (lexer);
 546       return true;
 547     }
 548   else
 549     return false;
 550 }
 551
 552 /* If the current token matches IDENTIFIER, skips it and returns true.
 553    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 554    returns false.
 555
 556    IDENTIFIER must be an ASCII string. */
 557 bool
 558 lex_match_id (struct lexer *lexer, const char *identifier)
 559 {
 560   return lex_match_id_n (lexer, identifier, 3);
 561 }
 562
 563 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 564    may be abbreviated to its first N letters.  Otherwise, returns false.
 565
 566    IDENTIFIER must be an ASCII string. */
 567 bool
 568 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 569 {
 570   if (lex_token (lexer) == T_ID
 571       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 572     {
 573       lex_get (lexer);
 574       return true;
 575     }
 576   else
 577     return false;
 578 }
 579
 580 /* If the current token is integer X, skips it and returns true.  Otherwise,
 581    returns false. */
 582 bool
 583 lex_match_int (struct lexer *lexer, int x)
 584 {
 585   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 586     {
 587       lex_get (lexer);
 588       return true;
 589     }
 590   else
 591     return false;
 592 }
 593 \f
 594 /* Forced matches. */
 595
 596 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 597    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 598    false.
 599
 600    IDENTIFIER must be an ASCII string. */
 601 bool
 602 lex_force_match_id (struct lexer *lexer, const char *identifier)
 603 {
 604   if (lex_match_id (lexer, identifier))
 605     return true;
 606   else
 607     {
 608       lex_error_expecting (lexer, identifier);
 609       return false;
 610     }
 611 }
 612
 613 /* If the current token has the specified TYPE, skips it and returns true.
 614    Otherwise, reports an error and returns false. */
 615 bool
 616 lex_force_match (struct lexer *lexer, enum token_type type)
 617 {
 618   if (lex_token (lexer) == type)
 619     {
 620       lex_get (lexer);
 621       return true;
 622     }
 623   else
 624     {
 625       const char *type_string = token_type_to_string (type);
 626       if (type_string)
 627         {
 628           char *s = xasprintf ("`%s'", type_string);
 629           lex_error_expecting (lexer, s);
 630           free (s);
 631         }
 632       else
 633         lex_error_expecting (lexer, token_type_to_name (type));
 634
 635       return false;
 636     }
 637 }
 638
 639 /* If the current token is a string, does nothing and returns true.
 640    Otherwise, reports an error and returns false. */
 641 bool
 642 lex_force_string (struct lexer *lexer)
 643 {
 644   if (lex_is_string (lexer))
 645     return true;
 646   else
 647     {
 648       lex_error (lexer, _("expecting string"));
 649       return false;
 650     }
 651 }
 652
 653 /* If the current token is a string or an identifier, does nothing and returns
 654    true.  Otherwise, reports an error and returns false.
 655
 656    This is meant for use in syntactic situations where we want to encourage the
 657    user to supply a quoted string, but for compatibility we also accept
 658    identifiers.  (One example of such a situation is file names.)  Therefore,
 659    the error message issued when the current token is wrong only says that a
 660    string is expected and doesn't mention that an identifier would also be
 661    accepted. */
 662 bool
 663 lex_force_string_or_id (struct lexer *lexer)
 664 {
 665   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 666 }
 667
 668 /* If the current token is an integer, does nothing and returns true.
 669    Otherwise, reports an error and returns false. */
 670 bool
 671 lex_force_int (struct lexer *lexer)
 672 {
 673   if (lex_is_integer (lexer))
 674     return true;
 675   else
 676     {
 677       lex_error (lexer, _("expecting integer"));
 678       return false;
 679     }
 680 }
 681
 682 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 683    nothing and returns true.  Otherwise, reports an error and returns false.
 684    If NAME is nonnull, then it is used in the error message. */
 685 bool
 686 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 687 {
 688   bool is_integer = lex_is_integer (lexer);
 689   bool too_small = is_integer && lex_integer (lexer) < min;
 690   bool too_big = is_integer && lex_integer (lexer) > max;
 691   if (is_integer && !too_small && !too_big)
 692     return true;
 693
 694   if (min > max)
 695     {
 696       /* Weird, maybe a bug in the caller.  Just report that we needed an
 697          integer. */
 698       if (name)
 699         lex_error (lexer, _("Integer expected for %s."), name);
 700       else
 701         lex_error (lexer, _("Integer expected."));
 702     }
 703   else if (min == max)
 704     {
 705       if (name)
 706         lex_error (lexer, _("Expected %ld for %s."), min, name);
 707       else
 708         lex_error (lexer, _("Expected %ld."), min);
 709     }
 710   else if (min + 1 == max)
 711     {
 712       if (name)
 713         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 714       else
 715         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 716     }
 717   else
 718     {
 719       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 720       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 721
 722       if (report_lower_bound && report_upper_bound)
 723         {
 724           if (name)
 725             lex_error (lexer,
 726                        _("Expected integer between %ld and %ld for %s."),
 727                        min, max, name);
 728           else
 729             lex_error (lexer, _("Expected integer between %ld and %ld."),
 730                        min, max);
 731         }
 732       else if (report_lower_bound)
 733         {
 734           if (min == 0)
 735             {
 736               if (name)
 737                 lex_error (lexer, _("Expected non-negative integer for %s."),
 738                            name);
 739               else
 740                 lex_error (lexer, _("Expected non-negative integer."));
 741             }
 742           else if (min == 1)
 743             {
 744               if (name)
 745                 lex_error (lexer, _("Expected positive integer for %s."),
 746                            name);
 747               else
 748                 lex_error (lexer, _("Expected positive integer."));
 749             }
 750         }
 751       else if (report_upper_bound)
 752         {
 753           if (name)
 754             lex_error (lexer,
 755                        _("Expected integer less than or equal to %ld for %s."),
 756                        max, name);
 757           else
 758             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 759                        max);
 760         }
 761       else
 762         {
 763           if (name)
 764             lex_error (lexer, _("Integer expected for %s."), name);
 765           else
 766             lex_error (lexer, _("Integer expected."));
 767         }
 768     }
 769   return false;
 770 }
 771
 772 /* If the current token is a number, does nothing and returns true.
 773    Otherwise, reports an error and returns false. */
 774 bool
 775 lex_force_num (struct lexer *lexer)
 776 {
 777   if (lex_is_number (lexer))
 778     return true;
 779
 780   lex_error (lexer, _("expecting number"));
 781   return false;
 782 }
 783
 784 /* If the current token is an identifier, does nothing and returns true.
 785    Otherwise, reports an error and returns false. */
 786 bool
 787 lex_force_id (struct lexer *lexer)
 788 {
 789   if (lex_token (lexer) == T_ID)
 790     return true;
 791
 792   lex_error (lexer, _("expecting identifier"));
 793   return false;
 794 }
 795 \f
 796 /* Token accessors. */
 797
 798 /* Returns the type of LEXER's current token. */
 799 enum token_type
 800 lex_token (const struct lexer *lexer)
 801 {
 802   return lex_next_token (lexer, 0);
 803 }
 804
 805 /* Returns the number in LEXER's current token.
 806
 807    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 808    tokens this function will always return zero. */
 809 double
 810 lex_tokval (const struct lexer *lexer)
 811 {
 812   return lex_next_tokval (lexer, 0);
 813 }
 814
 815 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 816
 817    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 818    this functions this function will always return NULL.
 819
 820    The UTF-8 encoding of the returned string is correct for variable names and
 821    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 822    data_in() to use it in a "union value".  */
 823 const char *
 824 lex_tokcstr (const struct lexer *lexer)
 825 {
 826   return lex_next_tokcstr (lexer, 0);
 827 }
 828
 829 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 830    null-terminated (but the null terminator is not included in the returned
 831    substring's 'length').
 832
 833    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 834    this functions this function will always return NULL.
 835
 836    The UTF-8 encoding of the returned string is correct for variable names and
 837    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 838    data_in() to use it in a "union value".  */
 839 struct substring
 840 lex_tokss (const struct lexer *lexer)
 841 {
 842   return lex_next_tokss (lexer, 0);
 843 }
 844 \f
 845 /* Looking ahead.
 846
 847    A value of 0 for N as an argument to any of these functions refers to the
 848    current token.  Lookahead is limited to the current command.  Any N greater
 849    than the number of tokens remaining in the current command will be treated
 850    as referring to a T_ENDCMD token. */
 851
 852 static const struct lex_token *
 853 lex_next__ (const struct lexer *lexer_, int n)
 854 {
 855   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 856   struct lex_source *src = lex_source__ (lexer);
 857
 858   if (src != NULL)
 859     return lex_source_next__ (src, n);
 860   else
 861     {
 862       static const struct lex_token stop_token =
 863         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 864
 865       return &stop_token;
 866     }
 867 }
 868
 869 static const struct lex_token *
 870 lex_source_next__ (const struct lex_source *src, int n)
 871 {
 872   while (deque_count (&src->deque) <= n)
 873     {
 874       if (!deque_is_empty (&src->deque))
 875         {
 876           struct lex_token *front;
 877
 878           front = &src->tokens[deque_front (&src->deque, 0)];
 879           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 880             return front;
 881         }
 882
 883       lex_source_get__ (src);
 884     }
 885
 886   return &src->tokens[deque_back (&src->deque, n)];
 887 }
 888
 889 /* Returns the "struct token" of the token N after the current one in LEXER.
 890    The returned pointer can be invalidated by pretty much any succeeding call
 891    into the lexer, although the string pointer within the returned token is
 892    only invalidated by consuming the token (e.g. with lex_get()). */
 893 const struct token *
 894 lex_next (const struct lexer *lexer, int n)
 895 {
 896   return &lex_next__ (lexer, n)->token;
 897 }
 898
 899 /* Returns the type of the token N after the current one in LEXER. */
 900 enum token_type
 901 lex_next_token (const struct lexer *lexer, int n)
 902 {
 903   return lex_next (lexer, n)->type;
 904 }
 905
 906 /* Returns the number in the tokn N after the current one in LEXER.
 907
 908    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 909    tokens this function will always return zero. */
 910 double
 911 lex_next_tokval (const struct lexer *lexer, int n)
 912 {
 913   const struct token *token = lex_next (lexer, n);
 914   return token->number;
 915 }
 916
 917 /* Returns the null-terminated string in the token N after the current one, in
 918    UTF-8 encoding.
 919
 920    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 921    this functions this function will always return NULL.
 922
 923    The UTF-8 encoding of the returned string is correct for variable names and
 924    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 925    data_in() to use it in a "union value".  */
 926 const char *
 927 lex_next_tokcstr (const struct lexer *lexer, int n)
 928 {
 929   return lex_next_tokss (lexer, n).string;
 930 }
 931
 932 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 933    The string is null-terminated (but the null terminator is not included in
 934    the returned substring's 'length').
 935
 936    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
 937    tokens this functions this function will always return NULL.
 938
 939    The UTF-8 encoding of the returned string is correct for variable names and
 940    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 941    data_in() to use it in a "union value".  */
 942 struct substring
 943 lex_next_tokss (const struct lexer *lexer, int n)
 944 {
 945   return lex_next (lexer, n)->string;
 946 }
 947
 948 static bool
 949 lex_tokens_match (const struct token *actual, const struct token *expected)
 950 {
 951   if (actual->type != expected->type)
 952     return false;
 953
 954   switch (actual->type)
 955     {
 956     case T_POS_NUM:
 957     case T_NEG_NUM:
 958       return actual->number == expected->number;
 959
 960     case T_ID:
 961       return lex_id_match (expected->string, actual->string);
 962
 963     case T_STRING:
 964       return (actual->string.length == expected->string.length
 965               && !memcmp (actual->string.string, expected->string.string,
 966                           actual->string.length));
 967
 968     default:
 969       return true;
 970     }
 971 }
 972
 973 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
 974    skips it and returns true.  Otherwise, returns false.
 975
 976    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
 977    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
 978    first three letters. */
 979 bool
 980 lex_match_phrase (struct lexer *lexer, const char *s)
 981 {
 982   struct string_lexer slex;
 983   struct token token;
 984   int i;
 985
 986   i = 0;
 987   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
 988   while (string_lexer_next (&slex, &token))
 989     if (token.type != SCAN_SKIP)
 990       {
 991         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
 992         token_destroy (&token);
 993         if (!match)
 994           return false;
 995       }
 996
 997   while (i-- > 0)
 998     lex_get (lexer);
 999   return true;
1000 }
1001
1002 static int
1003 lex_source_get_first_line_number (const struct lex_source *src, int n)
1004 {
1005   return lex_source_next__ (src, n)->first_line;
1006 }
1007
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  int count = 0;
  char *end = s + length;

  /* Hop from one newline to the next, searching only the bytes that remain
     after the previous hit. */
  for (char *nl = memchr (s, '\n', length); nl != NULL;
       nl = memchr (nl + 1, '\n', end - (nl + 1)))
    count++;

  return count;
}
1023
1024 static int
1025 lex_source_get_last_line_number (const struct lex_source *src, int n)
1026 {
1027   const struct lex_token *token = lex_source_next__ (src, n);
1028
1029   if (token->first_line == 0)
1030     return 0;
1031   else
1032     {
1033       char *token_str = &src->buffer[token->token_pos - src->tail];
1034       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1035     }
1036 }
1037
1038 static int
1039 count_columns (const char *s_, size_t length)
1040 {
1041   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1042   int columns;
1043   size_t ofs;
1044   int mblen;
1045
1046   columns = 0;
1047   for (ofs = 0; ofs < length; ofs += mblen)
1048     {
1049       ucs4_t uc;
1050
1051       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1052       if (uc != '\t')
1053         {
1054           int width = uc_width (uc, "UTF-8");
1055           if (width > 0)
1056             columns += width;
1057         }
1058       else
1059         columns = ROUND_UP (columns + 1, 8);
1060     }
1061
1062   return columns + 1;
1063 }
1064
1065 static int
1066 lex_source_get_first_column (const struct lex_source *src, int n)
1067 {
1068   const struct lex_token *token = lex_source_next__ (src, n);
1069   return count_columns (&src->buffer[token->line_pos - src->tail],
1070                         token->token_pos - token->line_pos);
1071 }
1072
1073 static int
1074 lex_source_get_last_column (const struct lex_source *src, int n)
1075 {
1076   const struct lex_token *token = lex_source_next__ (src, n);
1077   char *start, *end, *newline;
1078
1079   start = &src->buffer[token->line_pos - src->tail];
1080   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1081   newline = memrchr (start, '\n', end - start);
1082   if (newline != NULL)
1083     start = newline + 1;
1084   return count_columns (start, end - start);
1085 }
1086
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token,
   if the token is drawn from a source that does not have line numbers, or if
   LEXER has no source at all. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_first_line_number (src, n);
}
1096
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token, if the token is drawn from a source that does not have line numbers,
   or if LEXER has no source at all.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_last_line_number (src, n);
}
1113
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP
   token or if LEXER has no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
1127
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if LEXER has no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1141
1142 /* Returns the name of the syntax file from which the current command is drawn.
1143    Returns NULL for a T_STOP token or if the command's source does not have
1144    line numbers.
1145
1146    There is no version of this function that takes an N argument because
1147    lookahead only works to the end of a command and any given command is always
1148    within a single syntax file. */
1149 const char *
1150 lex_get_file_name (const struct lexer *lexer)
1151 {
1152   struct lex_source *src = lex_source__ (lexer);
1153   return src == NULL ? NULL : src->reader->file_name;
1154 }
1155
1156 const char *
1157 lex_get_encoding (const struct lexer *lexer)
1158 {
1159   struct lex_source *src = lex_source__ (lexer);
1160   return src == NULL ? NULL : src->reader->encoding;
1161 }
1162
1163
1164 /* Returns the syntax mode for the syntax file from which the current drawn is
1165    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1166    source does not have line numbers.
1167
1168    There is no version of this function that takes an N argument because
1169    lookahead only works to the end of a command and any given command is always
1170    within a single syntax file. */
1171 enum lex_syntax_mode
1172 lex_get_syntax_mode (const struct lexer *lexer)
1173 {
1174   struct lex_source *src = lex_source__ (lexer);
1175   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1176 }
1177
1178 /* Returns the error mode for the syntax file from which the current drawn is
1179    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1180    source does not have line numbers.
1181
1182    There is no version of this function that takes an N argument because
1183    lookahead only works to the end of a command and any given command is always
1184    within a single syntax file. */
1185 enum lex_error_mode
1186 lex_get_error_mode (const struct lexer *lexer)
1187 {
1188   struct lex_source *src = lex_source__ (lexer);
1189   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1190 }
1191
1192 /* If the source that LEXER is currently reading has error mode
1193    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1194    token to be read comes directly from whatever is next read from the stream.
1195
1196    It makes sense to call this function after encountering an error in a
1197    command entered on the console, because usually the user would prefer not to
1198    have cascading errors. */
1199 void
1200 lex_interactive_reset (struct lexer *lexer)
1201 {
1202   struct lex_source *src = lex_source__ (lexer);
1203   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1204     {
1205       src->head = src->tail = 0;
1206       src->journal_pos = src->seg_pos = src->line_pos = 0;
1207       src->n_newlines = 0;
1208       src->suppress_next_newline = false;
1209       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1210       while (!deque_is_empty (&src->deque))
1211         lex_source_pop__ (src);
1212       lex_source_push_endcmd__ (src);
1213     }
1214 }
1215
1216 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1217 void
1218 lex_discard_rest_of_command (struct lexer *lexer)
1219 {
1220   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1221     lex_get (lexer);
1222 }
1223
1224 /* Discards all lookahead tokens in LEXER, then discards all input sources
1225    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1226    runs out of input sources. */
1227 void
1228 lex_discard_noninteractive (struct lexer *lexer)
1229 {
1230   struct lex_source *src = lex_source__ (lexer);
1231
1232   if (src != NULL)
1233     {
1234       while (!deque_is_empty (&src->deque))
1235         lex_source_pop__ (src);
1236
1237       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1238            src = lex_source__ (lexer))
1239         lex_source_destroy (src);
1240     }
1241 }
1242 \f
1243 static size_t
1244 lex_source_max_tail__ (const struct lex_source *src)
1245 {
1246   const struct lex_token *token;
1247   size_t max_tail;
1248
1249   assert (src->seg_pos >= src->line_pos);
1250   max_tail = MIN (src->journal_pos, src->line_pos);
1251
1252   /* Use the oldest token also.  (We know that src->deque cannot be empty
1253      because we are in the process of adding a new token, which is already
1254      initialized enough to use here.) */
1255   token = &src->tokens[deque_back (&src->deque, 0)];
1256   assert (token->token_pos >= token->line_pos);
1257   max_tail = MIN (max_tail, token->line_pos);
1258
1259   return max_tail;
1260 }
1261
1262 static void
1263 lex_source_expand__ (struct lex_source *src)
1264 {
1265   if (src->head - src->tail >= src->allocated)
1266     {
1267       size_t max_tail = lex_source_max_tail__ (src);
1268       if (max_tail > src->tail)
1269         {
1270           /* Advance the tail, freeing up room at the head. */
1271           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1272                    src->head - max_tail);
1273           src->tail = max_tail;
1274         }
1275       else
1276         {
1277           /* Buffer is completely full.  Expand it. */
1278           src->buffer = x2realloc (src->buffer, &src->allocated);
1279         }
1280     }
1281   else
1282     {
1283       /* There's space available at the head of the buffer.  Nothing to do. */
1284     }
1285 }
1286
1287 static void
1288 lex_source_read__ (struct lex_source *src)
1289 {
1290   do
1291     {
1292       lex_source_expand__ (src);
1293
1294       size_t head_ofs = src->head - src->tail;
1295       size_t space = src->allocated - head_ofs;
1296       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1297       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1298                                            space, prompt);
1299       assert (n <= space);
1300
1301       if (n == 0)
1302         {
1303           /* End of input. */
1304           src->reader->eof = true;
1305           lex_source_expand__ (src);
1306           return;
1307         }
1308
1309       src->head += n;
1310     }
1311   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1312                   src->head - src->seg_pos));
1313 }
1314
1315 static struct lex_source *
1316 lex_source__ (const struct lexer *lexer)
1317 {
1318   return (ll_is_empty (&lexer->sources) ? NULL
1319           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1320 }
1321
1322 static struct substring
1323 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1324 {
1325   const struct lex_token *token0 = lex_source_next__ (src, n0);
1326   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1327   size_t start = token0->token_pos;
1328   size_t end = token1->token_pos + token1->token_len;
1329
1330   return ss_buffer (&src->buffer[start - src->tail], end - start);
1331 }
1332
1333 static void
1334 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1335 {
1336   size_t out_maxlen;
1337   size_t out_len;
1338   int mblen;
1339
1340   assert (out_size >= 16);
1341   out_maxlen = out_size - 1;
1342   if (in.length > out_maxlen - 3)
1343     out_maxlen -= 3;
1344
1345   for (out_len = 0; out_len < in.length; out_len += mblen)
1346     {
1347       if (in.string[out_len] == '\n'
1348           || in.string[out_len] == '\0'
1349           || (in.string[out_len] == '\r'
1350               && out_len + 1 < in.length
1351               && in.string[out_len + 1] == '\n'))
1352         break;
1353
1354       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1355                         in.length - out_len);
1356
1357       if (mblen < 0)
1358         break;
1359
1360       if (out_len + mblen > out_maxlen)
1361         break;
1362     }
1363
1364   memcpy (out, in.string, out_len);
1365   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1366 }
1367
1368 static void
1369 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1370                          const char *format, va_list args)
1371 {
1372   const struct lex_token *token;
1373   struct string s;
1374
1375   ds_init_empty (&s);
1376
1377   token = lex_source_next__ (src, n0);
1378   if (token->token.type == T_ENDCMD)
1379     ds_put_cstr (&s, _("Syntax error at end of command"));
1380   else
1381     {
1382       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1383       if (!ss_is_empty (syntax))
1384         {
1385           char syntax_cstr[64];
1386
1387           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1388           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1389         }
1390       else
1391         ds_put_cstr (&s, _("Syntax error"));
1392     }
1393
1394   if (format)
1395     {
1396       ds_put_cstr (&s, ": ");
1397       ds_put_vformat (&s, format, args);
1398     }
1399   if (ds_last (&s) != '.')
1400     ds_put_byte (&s, '.');
1401
1402   struct msg m = {
1403     .category = MSG_C_SYNTAX,
1404     .severity = MSG_S_ERROR,
1405     .file_name = src->reader->file_name,
1406     .first_line = lex_source_get_first_line_number (src, n0),
1407     .last_line = lex_source_get_last_line_number (src, n1),
1408     .first_column = lex_source_get_first_column (src, n0),
1409     .last_column = lex_source_get_last_column (src, n1),
1410     .text = ds_steal_cstr (&s),
1411   };
1412   msg_emit (&m);
1413 }
1414
1415 static void PRINTF_FORMAT (2, 3)
1416 lex_get_error (struct lex_source *src, const char *format, ...)
1417 {
1418   va_list args;
1419   int n;
1420
1421   va_start (args, format);
1422
1423   n = deque_count (&src->deque) - 1;
1424   lex_source_error_valist (src, n, n, format, args);
1425   lex_source_pop_front (src);
1426
1427   va_end (args);
1428 }
1429
/* Attempts to append an additional token into SRC's deque, reading more from
   the underlying lex_reader if necessary.  Returns true if successful, false
   if the deque already represents (a suffix of) the whole lex_reader's
   contents.

   Although SRC_ is const-qualified, this function casts the qualifier away and
   modifies the source.

   A true return does not guarantee that the deque grew: for SCAN_SKIP and the
   scanner's error token types, the token pushed here is handed to
   lex_source_pop_front() (directly or via lex_get_error()) before
   returning. */
static bool
lex_source_get__ (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
  if (src->eof)
    return false;

  /* State maintained while scanning tokens.  Usually we only need a single
     state, but scanner_push() can return SCAN_SAVE to indicate that the state
     needs to be saved and possibly restored later with SCAN_BACK. */
  struct state
    {
      struct segmenter segmenter;
      enum segment_type last_segment;
      int newlines;             /* Number of newlines encountered so far. */
      /* Maintained here so we can update lex_source's similar members when we
         finish. */
      size_t line_pos;
      size_t seg_pos;
    };

  /* Initialize state. */
  struct state state =
    {
      .segmenter = src->segmenter,
      .newlines = 0,
      .seg_pos = src->seg_pos,
      .line_pos = src->line_pos,
    };
  struct state saved = state;

  /* Append a new token to SRC and initialize it. */
  struct lex_token *token = lex_push_token__ (src);
  struct scanner scanner;
  scanner_init (&scanner, &token->token);
  token->line_pos = src->line_pos;
  token->token_pos = src->seg_pos;
  /* A reader whose line_number is not positive yields tokens whose first_line
     is 0. */
  if (src->reader->line_number > 0)
    token->first_line = src->reader->line_number + src->n_newlines;
  else
    token->first_line = 0;

  /* Extract segments and pass them through the scanner until we obtain a
     token. */
  for (;;)
    {
      /* Extract a segment. */
      const char *segment = &src->buffer[state.seg_pos - src->tail];
      size_t seg_maxlen = src->head - state.seg_pos;
      enum segment_type type;
      int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
                                    src->reader->eof, &type);
      if (seg_len < 0)
        {
          /* The segmenter needs more input to produce a segment.

             Reading can move or reallocate SRC's buffer and advance its tail
             (see lex_source_expand__()), so SEGMENT must not be used
             afterward; the next iteration recomputes it. */
          assert (!src->reader->eof);
          lex_source_read__ (src);
          continue;
        }

      /* Update state based on the segment. */
      state.last_segment = type;
      state.seg_pos += seg_len;
      if (type == SEG_NEWLINE)
        {
          state.newlines++;
          state.line_pos = state.seg_pos;
        }

      /* Pass the segment into the scanner and try to get a token out. */
      enum scan_result result = scanner_push (&scanner, type,
                                              ss_buffer (segment, seg_len),
                                              &token->token);
      if (result == SCAN_SAVE)
        saved = state;
      else if (result == SCAN_BACK)
        {
          /* Rewind to the most recently saved state, forgetting the segments
             consumed since then. */
          state = saved;
          break;
        }
      else if (result == SCAN_DONE)
        break;
    }

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.

     A line that ends a command is emitted as soon as its SEG_END_COMMAND
     segment is seen; suppress_next_newline then keeps the new-line that
     follows from causing another line to be emitted. */
  int n_lines = state.newlines;
  if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos - src->tail];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->head even though that may be beyond what we've actually
         converted to tokens (which is only through state.line_pos).  That's
         because, if we're emitting the line due to SEG_END_COMMAND, we want to
         take the whole line through the newline, not just through the '.'. */
      size_t max_len = src->head - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax. */
      output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                   xmemdup0 (line, copy_len),
                                                   NULL));

      src->journal_pos += line_len;
    }

  /* The token covers everything consumed in this call, measured from the
     segment position at entry. */
  token->token_len = state.seg_pos - src->seg_pos;

  /* Commit the scanning state back into SRC. */
  src->segmenter = state.segmenter;
  src->seg_pos = state.seg_pos;
  src->line_pos = state.line_pos;
  src->n_newlines += state.newlines;

  /* Post-process the token types that should not be passed along as-is. */
  switch (token->token.type)
    {
    default:
      break;

    case T_STOP:
      /* Turn the end of input into one final T_ENDCMD and mark SRC so that
         later calls return false. */
      token->token.type = T_ENDCMD;
      src->eof = true;
      break;

    case SCAN_BAD_HEX_LENGTH:
      lex_get_error (src, _("String of hex digits has %d characters, which "
                            "is not a multiple of 2"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_HEX_DIGIT:
    case SCAN_BAD_UNICODE_DIGIT:
      lex_get_error (src, _("`%c' is not a valid hex digit"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_UNICODE_LENGTH:
      lex_get_error (src, _("Unicode string contains %d bytes, which is "
                            "not in the valid range of 1 to 8 bytes"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_UNICODE_CODE_POINT:
      lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
                     (int) token->token.number);
      break;

    case SCAN_EXPECTED_QUOTE:
      lex_get_error (src, _("Unterminated string constant"));
      break;

    case SCAN_EXPECTED_EXPONENT:
      lex_get_error (src, _("Missing exponent following `%s'"),
                     token->token.string.string);
      break;

    case SCAN_UNEXPECTED_DOT:
      lex_get_error (src, _("Unexpected `.' in middle of command"));
      break;

    case SCAN_UNEXPECTED_CHAR:
      {
        char c_name[16];
        lex_get_error (src, _("Bad character %s in input"),
                       uc_name (token->token.number, c_name));
      }
      break;

    case SCAN_SKIP:
      /* Nothing to report for this token; just hand it to
         lex_source_pop_front(). */
      lex_source_pop_front (src);
      break;
    }

  return true;
}
1629 \f
1630 static void
1631 lex_source_push_endcmd__ (struct lex_source *src)
1632 {
1633   struct lex_token *token = lex_push_token__ (src);
1634   token->token.type = T_ENDCMD;
1635   token->token_pos = 0;
1636   token->token_len = 0;
1637   token->line_pos = 0;
1638   token->first_line = 0;
1639 }
1640
1641 static struct lex_source *
1642 lex_source_create (struct lex_reader *reader)
1643 {
1644   struct lex_source *src;
1645   enum segmenter_mode mode;
1646
1647   src = xzalloc (sizeof *src);
1648   src->reader = reader;
1649
1650   if (reader->syntax == LEX_SYNTAX_AUTO)
1651     mode = SEG_MODE_AUTO;
1652   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1653     mode = SEG_MODE_INTERACTIVE;
1654   else if (reader->syntax == LEX_SYNTAX_BATCH)
1655     mode = SEG_MODE_BATCH;
1656   else
1657     NOT_REACHED ();
1658   segmenter_init (&src->segmenter, mode);
1659
1660   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1661
1662   lex_source_push_endcmd__ (src);
1663
1664   return src;
1665 }
1666
1667 static void
1668 lex_source_destroy (struct lex_source *src)
1669 {
1670   char *file_name = src->reader->file_name;
1671   char *encoding = src->reader->encoding;
1672   if (src->reader->class->destroy != NULL)
1673     src->reader->class->destroy (src->reader);
1674   free (file_name);
1675   free (encoding);
1676   free (src->buffer);
1677   while (!deque_is_empty (&src->deque))
1678     lex_source_pop__ (src);
1679   free (src->tokens);
1680   ll_remove (&src->ll);
1681   free (src);
1682 }
1683 \f
/* A lex_reader that reads syntax from a u8_istream (see
   lex_reader_for_file()). */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Embedded base reader. */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
  };

/* Forward declaration; defined after the functions it refers to. */
static struct lex_reader_class lex_file_reader_class;
1691
1692 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1693    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1694    ENCODING, which should take one of the forms accepted by
1695    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1696    mode of the new reader, respectively.
1697
1698    Returns a null pointer if FILE_NAME cannot be opened. */
1699 struct lex_reader *
1700 lex_reader_for_file (const char *file_name, const char *encoding,
1701                      enum lex_syntax_mode syntax,
1702                      enum lex_error_mode error)
1703 {
1704   struct lex_file_reader *r;
1705   struct u8_istream *istream;
1706
1707   istream = (!strcmp(file_name, "-")
1708              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1709              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1710   if (istream == NULL)
1711     {
1712       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1713       return NULL;
1714     }
1715
1716   r = xmalloc (sizeof *r);
1717   lex_reader_init (&r->reader, &lex_file_reader_class);
1718   r->reader.syntax = syntax;
1719   r->reader.error = error;
1720   r->reader.file_name = xstrdup (file_name);
1721   r->reader.encoding = xstrdup_if_nonnull (encoding);
1722   r->reader.line_number = 1;
1723   r->istream = istream;
1724
1725   return &r->reader;
1726 }
1727
1728 static struct lex_file_reader *
1729 lex_file_reader_cast (struct lex_reader *r)
1730 {
1731   return UP_CAST (r, struct lex_file_reader, reader);
1732 }
1733
1734 static size_t
1735 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1736                enum prompt_style prompt_style UNUSED)
1737 {
1738   struct lex_file_reader *r = lex_file_reader_cast (r_);
1739   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1740   if (n_read < 0)
1741     {
1742       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1743       return 0;
1744     }
1745   return n_read;
1746 }
1747
1748 static void
1749 lex_file_close (struct lex_reader *r_)
1750 {
1751   struct lex_file_reader *r = lex_file_reader_cast (r_);
1752
1753   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1754     {
1755       if (u8_istream_close (r->istream) != 0)
1756         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1757     }
1758   else
1759     u8_istream_free (r->istream);
1760
1761   free (r);
1762 }
1763
/* Class for readers created by lex_reader_for_file().  The initializers are
   positional; elsewhere in this file the class's functions are invoked as
   'read' and 'destroy'. */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,
    lex_file_close
  };
1769 \f
/* A lex_reader that reads syntax from a string held in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded base reader. */
    struct substring s;         /* The syntax; freed by lex_string_close(). */
    size_t offset;              /* Number of bytes of S already read. */
  };

/* Forward declaration; defined after the functions it refers to. */
static struct lex_reader_class lex_string_reader_class;
1778
1779 /* Creates and returns a new lex_reader for the contents of S, which must be
1780    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
1781    with ss_dealloc() when it is closed. */
1782 struct lex_reader *
1783 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1784 {
1785   struct lex_string_reader *r;
1786
1787   r = xmalloc (sizeof *r);
1788   lex_reader_init (&r->reader, &lex_string_reader_class);
1789   r->reader.syntax = LEX_SYNTAX_AUTO;
1790   r->reader.encoding = xstrdup_if_nonnull (encoding);
1791   r->s = s;
1792   r->offset = 0;
1793
1794   return &r->reader;
1795 }
1796
1797 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1798    which must be encoded in ENCODING.  The caller retains ownership of S. */
1799 struct lex_reader *
1800 lex_reader_for_string (const char *s, const char *encoding)
1801 {
1802   struct substring ss;
1803   ss_alloc_substring (&ss, ss_cstr (s));
1804   return lex_reader_for_substring_nocopy (ss, encoding);
1805 }
1806
/* Formats FORMAT as a printf()-like format string, taking the arguments to
   format from those that follow ENCODING, and creates and returns a new
   lex_reader for the formatted result, which is taken to be encoded in
   ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1821
1822 static struct lex_string_reader *
1823 lex_string_reader_cast (struct lex_reader *r)
1824 {
1825   return UP_CAST (r, struct lex_string_reader, reader);
1826 }
1827
1828 static size_t
1829 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1830                  enum prompt_style prompt_style UNUSED)
1831 {
1832   struct lex_string_reader *r = lex_string_reader_cast (r_);
1833   size_t chunk;
1834
1835   chunk = MIN (n, r->s.length - r->offset);
1836   memcpy (buf, r->s.string + r->offset, chunk);
1837   r->offset += chunk;
1838
1839   return chunk;
1840 }
1841
1842 static void
1843 lex_string_close (struct lex_reader *r_)
1844 {
1845   struct lex_string_reader *r = lex_string_reader_cast (r_);
1846
1847   ss_dealloc (&r->s);
1848   free (r);
1849 }
1850
/* Class for readers created by lex_reader_for_substring_nocopy().  The
   initializers are positional; elsewhere in this file the class's functions
   are invoked as 'read' and 'destroy'. */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,
    lex_string_close
  };