pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/text-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source.

   Bundles the token proper with the position, inside the owning lex_source's
   buffer, of the source text for that token. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head.

       These are offsets into the UTF-8 source as a whole; subtract src->tail
       to obtain an index into src->buffer. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos (0 is treated as
                                   "no line number" by
                                   lex_source_get_last_line_number()). */
  };
  71
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* The wrapped reader. */
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes.

       'tail' and 'head' are offsets into the source as a whole, so a source
       offset POS corresponds to buffer[POS - tail]. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline; /* NOTE(review): not used in the visible part
                                   of this file; confirm meaning where it is
                                   set. */

    /* Tokens.

       New tokens are pushed at the deque's front and the current token is
       read from (and popped off) its back. */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 101
 102 static struct lex_source *lex_source_create (struct lex_reader *);
 103 static void lex_source_destroy (struct lex_source *);
 104
/* Lexer.

   A lexer is a list of syntax sources.  lex_include() adds a source at the
   head of the list and lex_append() adds one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
  };
 110
 111 static struct lex_source *lex_source__ (const struct lexer *);
 112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 113 static void lex_source_push_endcmd__ (struct lex_source *);
 114
 115 static void lex_source_pop__ (struct lex_source *);
 116 static bool lex_source_get__ (const struct lex_source *);
 117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 118                                      const char *format, va_list)
 119    PRINTF_FORMAT (4, 0);
 120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 121                                                   int n);
 122 \f
 123 /* Initializes READER with the specified CLASS and otherwise some reasonable
 124    defaults.  The caller should fill in the others members as desired. */
 125 void
 126 lex_reader_init (struct lex_reader *reader,
 127                  const struct lex_reader_class *class)
 128 {
 129   reader->class = class;
 130   reader->syntax = LEX_SYNTAX_AUTO;
 131   reader->error = LEX_ERROR_CONTINUE;
 132   reader->file_name = NULL;
 133   reader->encoding = NULL;
 134   reader->line_number = 0;
 135 }
 136
 137 /* Frees any file name already in READER and replaces it by a copy of
 138    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 139 void
 140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 141 {
 142   free (reader->file_name);
 143   reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
 144 }
 145 \f
 146 /* Creates and returns a new lexer. */
 147 struct lexer *
 148 lex_create (void)
 149 {
 150   struct lexer *lexer = xzalloc (sizeof *lexer);
 151   ll_init (&lexer->sources);
 152   return lexer;
 153 }
 154
 155 /* Destroys LEXER. */
 156 void
 157 lex_destroy (struct lexer *lexer)
 158 {
 159   if (lexer != NULL)
 160     {
 161       struct lex_source *source, *next;
 162
 163       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 164         lex_source_destroy (source);
 165       free (lexer);
 166     }
 167 }
 168
 169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 170    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 171    token. */
 172 void
 173 lex_include (struct lexer *lexer, struct lex_reader *reader)
 174 {
 175   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 176   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 177 }
 178
 179 /* Appends READER to LEXER, so that it will be read after all other current
 180    readers have already been read. */
 181 void
 182 lex_append (struct lexer *lexer, struct lex_reader *reader)
 183 {
 184   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 185 }
 186 \f
 187 /* Advancing. */
 188
 189 static struct lex_token *
 190 lex_push_token__ (struct lex_source *src)
 191 {
 192   struct lex_token *token;
 193
 194   if (deque_is_full (&src->deque))
 195     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 196
 197   token = &src->tokens[deque_push_front (&src->deque)];
 198   token_init (&token->token);
 199   return token;
 200 }
 201
 202 static void
 203 lex_source_pop__ (struct lex_source *src)
 204 {
 205   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 206 }
 207
 208 static void
 209 lex_source_pop_front (struct lex_source *src)
 210 {
 211   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 212 }
 213
 214 /* Advances LEXER to the next token, consuming the current token. */
 215 void
 216 lex_get (struct lexer *lexer)
 217 {
 218   struct lex_source *src;
 219
 220   src = lex_source__ (lexer);
 221   if (src == NULL)
 222     return;
 223
 224   if (!deque_is_empty (&src->deque))
 225     lex_source_pop__ (src);
 226
 227   while (deque_is_empty (&src->deque))
 228     if (!lex_source_get__ (src))
 229       {
 230         lex_source_destroy (src);
 231         src = lex_source__ (lexer);
 232         if (src == NULL)
 233           return;
 234       }
 235 }
 236 \f
 237 /* Issuing errors. */
 238
 239 /* Prints a syntax error message containing the current token and
 240    given message MESSAGE (if non-null). */
 241 void
 242 lex_error (struct lexer *lexer, const char *format, ...)
 243 {
 244   va_list args;
 245
 246   va_start (args, format);
 247   lex_next_error_valist (lexer, 0, 0, format, args);
 248   va_end (args);
 249 }
 250
 251 /* Prints a syntax error message containing the current token and
 252    given message MESSAGE (if non-null). */
 253 void
 254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 255 {
 256   lex_next_error_valist (lexer, 0, 0, format, args);
 257 }
 258
 259 /* Prints a syntax error message containing the current token and
 260    given message MESSAGE (if non-null). */
 261 void
 262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 263 {
 264   va_list args;
 265
 266   va_start (args, format);
 267   lex_next_error_valist (lexer, n0, n1, format, args);
 268   va_end (args);
 269 }
 270
 271 /* Prints a syntax error message saying that OPTION0 or one of the other
 272    strings following it, up to the first NULL, is expected. */
 273 void
 274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
 275 {
 276   enum { MAX_OPTIONS = 8 };
 277   const char *options[MAX_OPTIONS + 1];
 278   va_list args;
 279   int n;
 280
 281   va_start (args, option0);
 282   options[0] = option0;
 283   n = 0;
 284   while (n + 1 < MAX_OPTIONS && options[n] != NULL)
 285     options[++n] = va_arg (args, const char *);
 286   va_end (args);
 287
 288   switch (n)
 289     {
 290     case 0:
 291       lex_error (lexer, NULL);
 292       break;
 293
 294     case 1:
 295       lex_error (lexer, _("expecting %s"), options[0]);
 296       break;
 297
 298     case 2:
 299       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 300       break;
 301
 302     case 3:
 303       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 304                  options[2]);
 305       break;
 306
 307     case 4:
 308       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 309                  options[0], options[1], options[2], options[3]);
 310       break;
 311
 312     case 5:
 313       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 314                  options[0], options[1], options[2], options[3], options[4]);
 315       break;
 316
 317     case 6:
 318       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 319                  options[0], options[1], options[2], options[3], options[4],
 320                  options[5]);
 321       break;
 322
 323     case 7:
 324       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 325                  options[0], options[1], options[2], options[3], options[4],
 326                  options[5], options[6]);
 327       break;
 328
 329     case 8:
 330       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 331                  options[0], options[1], options[2], options[3], options[4],
 332                  options[5], options[6], options[7]);
 333       break;
 334
 335     default:
 336       NOT_REACHED ();
 337     }
 338 }
 339
 340 /* Reports an error to the effect that subcommand SBC may only be specified
 341    once.
 342
 343    This function does not take a lexer as an argument or use lex_error(),
 344    because the result would ordinarily just be redundant: "Syntax error at
 345    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 346    not help the user find the error. */
 347 void
 348 lex_sbc_only_once (const char *sbc)
 349 {
 350   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 351 }
 352
 353 /* Reports an error to the effect that subcommand SBC is missing.
 354
 355    This function does not take a lexer as an argument or use lex_error(),
 356    because a missing subcommand can normally be detected only after the whole
 357    command has been parsed, and so lex_error() would always report "Syntax
 358    error at end of command", which does not help the user find the error. */
 359 void
 360 lex_sbc_missing (const char *sbc)
 361 {
 362   msg (SE, _("Required subcommand %s was not specified."), sbc);
 363 }
 364
 365 /* Reports an error to the effect that specification SPEC may only be specified
 366    once within subcommand SBC. */
 367 void
 368 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 369 {
 370   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 371              spec, sbc);
 372 }
 373
 374 /* Reports an error to the effect that specification SPEC is missing within
 375    subcommand SBC. */
 376 void
 377 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 378 {
 379   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 380              sbc, spec);
 381 }
 382
 383 /* Prints a syntax error message containing the current token and
 384    given message MESSAGE (if non-null). */
 385 void
 386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 387                        const char *format, va_list args)
 388 {
 389   struct lex_source *src = lex_source__ (lexer);
 390
 391   if (src != NULL)
 392     lex_source_error_valist (src, n0, n1, format, args);
 393   else
 394     {
 395       struct string s;
 396
 397       ds_init_empty (&s);
 398       ds_put_format (&s, _("Syntax error at end of input"));
 399       if (format != NULL)
 400         {
 401           ds_put_cstr (&s, ": ");
 402           ds_put_vformat (&s, format, args);
 403         }
 404       ds_put_byte (&s, '.');
 405       msg (SE, "%s", ds_cstr (&s));
 406       ds_destroy (&s);
 407     }
 408 }
 409
 410 /* Checks that we're at end of command.
 411    If so, returns a successful command completion code.
 412    If not, flags a syntax error and returns an error command
 413    completion code. */
 414 int
 415 lex_end_of_command (struct lexer *lexer)
 416 {
 417   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 418     {
 419       lex_error (lexer, _("expecting end of command"));
 420       return CMD_FAILURE;
 421     }
 422   else
 423     return CMD_SUCCESS;
 424 }
 425 \f
 426 /* Token testing functions. */
 427
 428 /* Returns true if the current token is a number. */
 429 bool
 430 lex_is_number (struct lexer *lexer)
 431 {
 432   return lex_next_is_number (lexer, 0);
 433 }
 434
 435 /* Returns true if the current token is a string. */
 436 bool
 437 lex_is_string (struct lexer *lexer)
 438 {
 439   return lex_next_is_string (lexer, 0);
 440 }
 441
 442 /* Returns the value of the current token, which must be a
 443    floating point number. */
 444 double
 445 lex_number (struct lexer *lexer)
 446 {
 447   return lex_next_number (lexer, 0);
 448 }
 449
 450 /* Returns true iff the current token is an integer. */
 451 bool
 452 lex_is_integer (struct lexer *lexer)
 453 {
 454   return lex_next_is_integer (lexer, 0);
 455 }
 456
 457 /* Returns the value of the current token, which must be an
 458    integer. */
 459 long
 460 lex_integer (struct lexer *lexer)
 461 {
 462   return lex_next_integer (lexer, 0);
 463 }
 464 \f
 465 /* Token testing functions with lookahead.
 466
 467    A value of 0 for N as an argument to any of these functions refers to the
 468    current token.  Lookahead is limited to the current command.  Any N greater
 469    than the number of tokens remaining in the current command will be treated
 470    as referring to a T_ENDCMD token. */
 471
 472 /* Returns true if the token N ahead of the current token is a number. */
 473 bool
 474 lex_next_is_number (struct lexer *lexer, int n)
 475 {
 476   enum token_type next_token = lex_next_token (lexer, n);
 477   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 478 }
 479
 480 /* Returns true if the token N ahead of the current token is a string. */
 481 bool
 482 lex_next_is_string (struct lexer *lexer, int n)
 483 {
 484   return lex_next_token (lexer, n) == T_STRING;
 485 }
 486
 487 /* Returns the value of the token N ahead of the current token, which must be a
 488    floating point number. */
 489 double
 490 lex_next_number (struct lexer *lexer, int n)
 491 {
 492   assert (lex_next_is_number (lexer, n));
 493   return lex_next_tokval (lexer, n);
 494 }
 495
 496 /* Returns true if the token N ahead of the current token is an integer. */
 497 bool
 498 lex_next_is_integer (struct lexer *lexer, int n)
 499 {
 500   double value;
 501
 502   if (!lex_next_is_number (lexer, n))
 503     return false;
 504
 505   value = lex_next_tokval (lexer, n);
 506   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 507 }
 508
 509 /* Returns the value of the token N ahead of the current token, which must be
 510    an integer. */
 511 long
 512 lex_next_integer (struct lexer *lexer, int n)
 513 {
 514   assert (lex_next_is_integer (lexer, n));
 515   return lex_next_tokval (lexer, n);
 516 }
 517 \f
 518 /* Token matching functions. */
 519
 520 /* If the current token has the specified TYPE, skips it and returns true.
 521    Otherwise, returns false. */
 522 bool
 523 lex_match (struct lexer *lexer, enum token_type type)
 524 {
 525   if (lex_token (lexer) == type)
 526     {
 527       lex_get (lexer);
 528       return true;
 529     }
 530   else
 531     return false;
 532 }
 533
 534 /* If the current token matches IDENTIFIER, skips it and returns true.
 535    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 536    returns false.
 537
 538    IDENTIFIER must be an ASCII string. */
 539 bool
 540 lex_match_id (struct lexer *lexer, const char *identifier)
 541 {
 542   return lex_match_id_n (lexer, identifier, 3);
 543 }
 544
 545 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 546    may be abbreviated to its first N letters.  Otherwise, returns false.
 547
 548    IDENTIFIER must be an ASCII string. */
 549 bool
 550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 551 {
 552   if (lex_token (lexer) == T_ID
 553       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 554     {
 555       lex_get (lexer);
 556       return true;
 557     }
 558   else
 559     return false;
 560 }
 561
 562 /* If the current token is integer X, skips it and returns true.  Otherwise,
 563    returns false. */
 564 bool
 565 lex_match_int (struct lexer *lexer, int x)
 566 {
 567   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 568     {
 569       lex_get (lexer);
 570       return true;
 571     }
 572   else
 573     return false;
 574 }
 575 \f
 576 /* Forced matches. */
 577
 578 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 579    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 580    false.
 581
 582    IDENTIFIER must be an ASCII string. */
 583 bool
 584 lex_force_match_id (struct lexer *lexer, const char *identifier)
 585 {
 586   if (lex_match_id (lexer, identifier))
 587     return true;
 588   else
 589     {
 590       lex_error_expecting (lexer, identifier, NULL_SENTINEL);
 591       return false;
 592     }
 593 }
 594
 595 /* If the current token has the specified TYPE, skips it and returns true.
 596    Otherwise, reports an error and returns false. */
 597 bool
 598 lex_force_match (struct lexer *lexer, enum token_type type)
 599 {
 600   if (lex_token (lexer) == type)
 601     {
 602       lex_get (lexer);
 603       return true;
 604     }
 605   else
 606     {
 607       char *s = xasprintf ("`%s'", token_type_to_string (type));
 608       lex_error_expecting (lexer, s, NULL_SENTINEL);
 609       free (s);
 610       return false;
 611     }
 612 }
 613
 614 /* If the current token is a string, does nothing and returns true.
 615    Otherwise, reports an error and returns false. */
 616 bool
 617 lex_force_string (struct lexer *lexer)
 618 {
 619   if (lex_is_string (lexer))
 620     return true;
 621   else
 622     {
 623       lex_error (lexer, _("expecting string"));
 624       return false;
 625     }
 626 }
 627
 628 /* If the current token is a string or an identifier, does nothing and returns
 629    true.  Otherwise, reports an error and returns false.
 630
 631    This is meant for use in syntactic situations where we want to encourage the
 632    user to supply a quoted string, but for compatibility we also accept
 633    identifiers.  (One example of such a situation is file names.)  Therefore,
 634    the error message issued when the current token is wrong only says that a
 635    string is expected and doesn't mention that an identifier would also be
 636    accepted. */
 637 bool
 638 lex_force_string_or_id (struct lexer *lexer)
 639 {
 640   return lex_is_integer (lexer) || lex_force_string (lexer);
 641 }
 642
 643 /* If the current token is an integer, does nothing and returns true.
 644    Otherwise, reports an error and returns false. */
 645 bool
 646 lex_force_int (struct lexer *lexer)
 647 {
 648   if (lex_is_integer (lexer))
 649     return true;
 650   else
 651     {
 652       lex_error (lexer, _("expecting integer"));
 653       return false;
 654     }
 655 }
 656
 657 /* If the current token is a number, does nothing and returns true.
 658    Otherwise, reports an error and returns false. */
 659 bool
 660 lex_force_num (struct lexer *lexer)
 661 {
 662   if (lex_is_number (lexer))
 663     return true;
 664
 665   lex_error (lexer, _("expecting number"));
 666   return false;
 667 }
 668
 669 /* If the current token is an identifier, does nothing and returns true.
 670    Otherwise, reports an error and returns false. */
 671 bool
 672 lex_force_id (struct lexer *lexer)
 673 {
 674   if (lex_token (lexer) == T_ID)
 675     return true;
 676
 677   lex_error (lexer, _("expecting identifier"));
 678   return false;
 679 }
 680 \f
 681 /* Token accessors. */
 682
 683 /* Returns the type of LEXER's current token. */
 684 enum token_type
 685 lex_token (const struct lexer *lexer)
 686 {
 687   return lex_next_token (lexer, 0);
 688 }
 689
 690 /* Returns the number in LEXER's current token.
 691
 692    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 693    tokens this function will always return zero. */
 694 double
 695 lex_tokval (const struct lexer *lexer)
 696 {
 697   return lex_next_tokval (lexer, 0);
 698 }
 699
 700 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 701
 702    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 703    this functions this function will always return NULL.
 704
 705    The UTF-8 encoding of the returned string is correct for variable names and
 706    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 707    data_in() to use it in a "union value".  */
 708 const char *
 709 lex_tokcstr (const struct lexer *lexer)
 710 {
 711   return lex_next_tokcstr (lexer, 0);
 712 }
 713
 714 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 715    null-terminated (but the null terminator is not included in the returned
 716    substring's 'length').
 717
 718    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 719    this functions this function will always return NULL.
 720
 721    The UTF-8 encoding of the returned string is correct for variable names and
 722    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 723    data_in() to use it in a "union value".  */
 724 struct substring
 725 lex_tokss (const struct lexer *lexer)
 726 {
 727   return lex_next_tokss (lexer, 0);
 728 }
 729 \f
 730 /* Looking ahead.
 731
 732    A value of 0 for N as an argument to any of these functions refers to the
 733    current token.  Lookahead is limited to the current command.  Any N greater
 734    than the number of tokens remaining in the current command will be treated
 735    as referring to a T_ENDCMD token. */
 736
 737 static const struct lex_token *
 738 lex_next__ (const struct lexer *lexer_, int n)
 739 {
 740   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 741   struct lex_source *src = lex_source__ (lexer);
 742
 743   if (src != NULL)
 744     return lex_source_next__ (src, n);
 745   else
 746     {
 747       static const struct lex_token stop_token =
 748         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 749
 750       return &stop_token;
 751     }
 752 }
 753
 754 static const struct lex_token *
 755 lex_source_next__ (const struct lex_source *src, int n)
 756 {
 757   while (deque_count (&src->deque) <= n)
 758     {
 759       if (!deque_is_empty (&src->deque))
 760         {
 761           struct lex_token *front;
 762
 763           front = &src->tokens[deque_front (&src->deque, 0)];
 764           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 765             return front;
 766         }
 767
 768       lex_source_get__ (src);
 769     }
 770
 771   return &src->tokens[deque_back (&src->deque, n)];
 772 }
 773
 774 /* Returns the "struct token" of the token N after the current one in LEXER.
 775    The returned pointer can be invalidated by pretty much any succeeding call
 776    into the lexer, although the string pointer within the returned token is
 777    only invalidated by consuming the token (e.g. with lex_get()). */
 778 const struct token *
 779 lex_next (const struct lexer *lexer, int n)
 780 {
 781   return &lex_next__ (lexer, n)->token;
 782 }
 783
 784 /* Returns the type of the token N after the current one in LEXER. */
 785 enum token_type
 786 lex_next_token (const struct lexer *lexer, int n)
 787 {
 788   return lex_next (lexer, n)->type;
 789 }
 790
 791 /* Returns the number in the tokn N after the current one in LEXER.
 792
 793    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 794    tokens this function will always return zero. */
 795 double
 796 lex_next_tokval (const struct lexer *lexer, int n)
 797 {
 798   const struct token *token = lex_next (lexer, n);
 799   return token->number;
 800 }
 801
 802 /* Returns the null-terminated string in the token N after the current one, in
 803    UTF-8 encoding.
 804
 805    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 806    this functions this function will always return NULL.
 807
 808    The UTF-8 encoding of the returned string is correct for variable names and
 809    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 810    data_in() to use it in a "union value".  */
 811 const char *
 812 lex_next_tokcstr (const struct lexer *lexer, int n)
 813 {
 814   return lex_next_tokss (lexer, n).string;
 815 }
 816
 817 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 818    The string is null-terminated (but the null terminator is not included in
 819    the returned substring's 'length').
 820
 821    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 822    this functions this function will always return NULL.
 823
 824    The UTF-8 encoding of the returned string is correct for variable names and
 825    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 826    data_in() to use it in a "union value".  */
 827 struct substring
 828 lex_next_tokss (const struct lexer *lexer, int n)
 829 {
 830   return lex_next (lexer, n)->string;
 831 }
 832
 833 static bool
 834 lex_tokens_match (const struct token *actual, const struct token *expected)
 835 {
 836   if (actual->type != expected->type)
 837     return false;
 838
 839   switch (actual->type)
 840     {
 841     case T_POS_NUM:
 842     case T_NEG_NUM:
 843       return actual->number == expected->number;
 844
 845     case T_ID:
 846       return lex_id_match (expected->string, actual->string);
 847
 848     case T_STRING:
 849       return (actual->string.length == expected->string.length
 850               && !memcmp (actual->string.string, expected->string.string,
 851                           actual->string.length));
 852
 853     default:
 854       return true;
 855     }
 856 }
 857
 858 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
 859    skips it and returns true.  Otherwise, returns false.
 860
 861    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
 862    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
 863    first three letters. */
 864 bool
 865 lex_match_phrase (struct lexer *lexer, const char *s)
 866 {
 867   struct string_lexer slex;
 868   struct token token;
 869   int i;
 870
 871   i = 0;
 872   string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
 873   while (string_lexer_next (&slex, &token))
 874     if (token.type != SCAN_SKIP)
 875       {
 876         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
 877         token_destroy (&token);
 878         if (!match)
 879           return false;
 880       }
 881
 882   while (i-- > 0)
 883     lex_get (lexer);
 884   return true;
 885 }
 886
 887 static int
 888 lex_source_get_first_line_number (const struct lex_source *src, int n)
 889 {
 890   return lex_source_next__ (src, n)->first_line;
 891 }
 892
 893 static int
 894 count_newlines (char *s, size_t length)
 895 {
 896   int n_newlines = 0;
 897   char *newline;
 898
 899   while ((newline = memchr (s, '\n', length)) != NULL)
 900     {
 901       n_newlines++;
 902       length -= (newline + 1) - s;
 903       s = newline + 1;
 904     }
 905
 906   return n_newlines;
 907 }
 908
 909 static int
 910 lex_source_get_last_line_number (const struct lex_source *src, int n)
 911 {
 912   const struct lex_token *token = lex_source_next__ (src, n);
 913
 914   if (token->first_line == 0)
 915     return 0;
 916   else
 917     {
 918       char *token_str = &src->buffer[token->token_pos - src->tail];
 919       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 920     }
 921 }
 922
 923 static int
 924 count_columns (const char *s_, size_t length)
 925 {
 926   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 927   int columns;
 928   size_t ofs;
 929   int mblen;
 930
 931   columns = 0;
 932   for (ofs = 0; ofs < length; ofs += mblen)
 933     {
 934       ucs4_t uc;
 935
 936       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 937       if (uc != '\t')
 938         {
 939           int width = uc_width (uc, "UTF-8");
 940           if (width > 0)
 941             columns += width;
 942         }
 943       else
 944         columns = ROUND_UP (columns + 1, 8);
 945     }
 946
 947   return columns + 1;
 948 }
 949
 950 static int
 951 lex_source_get_first_column (const struct lex_source *src, int n)
 952 {
 953   const struct lex_token *token = lex_source_next__ (src, n);
 954   return count_columns (&src->buffer[token->line_pos - src->tail],
 955                         token->token_pos - token->line_pos);
 956 }
 957
 958 static int
 959 lex_source_get_last_column (const struct lex_source *src, int n)
 960 {
 961   const struct lex_token *token = lex_source_next__ (src, n);
 962   char *start, *end, *newline;
 963
 964   start = &src->buffer[token->line_pos - src->tail];
 965   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 966   newline = memrchr (start, '\n', end - start);
 967   if (newline != NULL)
 968     start = newline + 1;
 969   return count_columns (start, end - start);
 970 }
 971
 972 /* Returns the 1-based line number of the start of the syntax that represents
 973    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
 974    if the token is drawn from a source that does not have line numbers. */
 975 int
 976 lex_get_first_line_number (const struct lexer *lexer, int n)
 977 {
 978   const struct lex_source *src = lex_source__ (lexer);
 979   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
 980 }
 981
 982 /* Returns the 1-based line number of the end of the syntax that represents the
 983    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 984    token or if the token is drawn from a source that does not have line
 985    numbers.
 986
 987    Most of the time, a single token is wholly within a single line of syntax,
 988    but there are two exceptions: a T_STRING token can be made up of multiple
 989    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
 990    token can consist of a "-" on one line followed by the number on the next.
 991  */
 992 int
 993 lex_get_last_line_number (const struct lexer *lexer, int n)
 994 {
 995   const struct lex_source *src = lex_source__ (lexer);
 996   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
 997 }
 998
 999 /* Returns the 1-based column number of the start of the syntax that represents
1000    the token N after the current one in LEXER.  Returns 0 for a T_STOP
1001    token.
1002
1003    Column numbers are measured according to the width of characters as shown in
1004    a typical fixed-width font, in which CJK characters have width 2 and
1005    combining characters have width 0.  */
1006 int
1007 lex_get_first_column (const struct lexer *lexer, int n)
1008 {
1009   const struct lex_source *src = lex_source__ (lexer);
1010   return src != NULL ? lex_source_get_first_column (src, n) : 0;
1011 }
1012
/* Returns one more than the 1-based column at which the syntax for the token
   N after the current one in LEXER ends.  Returns 0 for a T_STOP token.

   Columns are counted by the width that characters occupy in a typical
   fixed-width font: CJK characters have width 2 and combining characters have
   width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1026
1027 /* Returns the name of the syntax file from which the current command is drawn.
1028    Returns NULL for a T_STOP token or if the command's source does not have
1029    line numbers.
1030
1031    There is no version of this function that takes an N argument because
1032    lookahead only works to the end of a command and any given command is always
1033    within a single syntax file. */
1034 const char *
1035 lex_get_file_name (const struct lexer *lexer)
1036 {
1037   struct lex_source *src = lex_source__ (lexer);
1038   return src == NULL ? NULL : src->reader->file_name;
1039 }
1040
1041 const char *
1042 lex_get_encoding (const struct lexer *lexer)
1043 {
1044   struct lex_source *src = lex_source__ (lexer);
1045   return src == NULL ? NULL : src->reader->encoding;
1046 }
1047
1048
1049 /* Returns the syntax mode for the syntax file from which the current drawn is
1050    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1051    source does not have line numbers.
1052
1053    There is no version of this function that takes an N argument because
1054    lookahead only works to the end of a command and any given command is always
1055    within a single syntax file. */
1056 enum lex_syntax_mode
1057 lex_get_syntax_mode (const struct lexer *lexer)
1058 {
1059   struct lex_source *src = lex_source__ (lexer);
1060   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1061 }
1062
1063 /* Returns the error mode for the syntax file from which the current drawn is
1064    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1065    source does not have line numbers.
1066
1067    There is no version of this function that takes an N argument because
1068    lookahead only works to the end of a command and any given command is always
1069    within a single syntax file. */
1070 enum lex_error_mode
1071 lex_get_error_mode (const struct lexer *lexer)
1072 {
1073   struct lex_source *src = lex_source__ (lexer);
1074   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1075 }
1076
1077 /* If the source that LEXER is currently reading has error mode
1078    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1079    token to be read comes directly from whatever is next read from the stream.
1080
1081    It makes sense to call this function after encountering an error in a
1082    command entered on the console, because usually the user would prefer not to
1083    have cascading errors. */
1084 void
1085 lex_interactive_reset (struct lexer *lexer)
1086 {
1087   struct lex_source *src = lex_source__ (lexer);
1088   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1089     {
1090       src->head = src->tail = 0;
1091       src->journal_pos = src->seg_pos = src->line_pos = 0;
1092       src->n_newlines = 0;
1093       src->suppress_next_newline = false;
1094       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1095       while (!deque_is_empty (&src->deque))
1096         lex_source_pop__ (src);
1097       lex_source_push_endcmd__ (src);
1098     }
1099 }
1100
1101 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1102 void
1103 lex_discard_rest_of_command (struct lexer *lexer)
1104 {
1105   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1106     lex_get (lexer);
1107 }
1108
1109 /* Discards all lookahead tokens in LEXER, then discards all input sources
1110    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1111    runs out of input sources. */
1112 void
1113 lex_discard_noninteractive (struct lexer *lexer)
1114 {
1115   struct lex_source *src = lex_source__ (lexer);
1116
1117   if (src != NULL)
1118     {
1119       while (!deque_is_empty (&src->deque))
1120         lex_source_pop__ (src);
1121
1122       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1123            src = lex_source__ (lexer))
1124         lex_source_destroy (src);
1125     }
1126 }
1127 \f
1128 static size_t
1129 lex_source_max_tail__ (const struct lex_source *src)
1130 {
1131   const struct lex_token *token;
1132   size_t max_tail;
1133
1134   assert (src->seg_pos >= src->line_pos);
1135   max_tail = MIN (src->journal_pos, src->line_pos);
1136
1137   /* Use the oldest token also.  (We know that src->deque cannot be empty
1138      because we are in the process of adding a new token, which is already
1139      initialized enough to use here.) */
1140   token = &src->tokens[deque_back (&src->deque, 0)];
1141   assert (token->token_pos >= token->line_pos);
1142   max_tail = MIN (max_tail, token->line_pos);
1143
1144   return max_tail;
1145 }
1146
1147 static void
1148 lex_source_expand__ (struct lex_source *src)
1149 {
1150   if (src->head - src->tail >= src->allocated)
1151     {
1152       size_t max_tail = lex_source_max_tail__ (src);
1153       if (max_tail > src->tail)
1154         {
1155           /* Advance the tail, freeing up room at the head. */
1156           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1157                    src->head - max_tail);
1158           src->tail = max_tail;
1159         }
1160       else
1161         {
1162           /* Buffer is completely full.  Expand it. */
1163           src->buffer = x2realloc (src->buffer, &src->allocated);
1164         }
1165     }
1166   else
1167     {
1168       /* There's space available at the head of the buffer.  Nothing to do. */
1169     }
1170 }
1171
1172 static void
1173 lex_source_read__ (struct lex_source *src)
1174 {
1175   do
1176     {
1177       lex_source_expand__ (src);
1178
1179       size_t head_ofs = src->head - src->tail;
1180       size_t space = src->allocated - head_ofs;
1181       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1182       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1183                                            space, prompt);
1184       assert (n <= space);
1185
1186       for (char *p = &src->buffer[head_ofs]; p < &src->buffer[head_ofs + n];
1187            p++)
1188         if (*p == '\0')
1189           {
1190             struct msg m;
1191             m.category = MSG_C_SYNTAX;
1192             m.severity = MSG_S_ERROR;
1193             m.file_name = src->reader->file_name;
1194             m.first_line = 0;
1195             m.last_line = 0;
1196             m.first_column = 0;
1197             m.last_column = 0;
1198             m.text = xstrdup ("Bad character U+0000 in input.");
1199             msg_emit (&m);
1200
1201             *p = ' ';
1202           }
1203
1204       if (n == 0)
1205         {
1206           /* End of input.
1207
1208              Ensure that the input always ends in a new-line followed by a null
1209              byte, as required by the segmenter library. */
1210
1211           if (src->head == src->tail
1212               || src->buffer[src->head - src->tail - 1] != '\n')
1213             src->buffer[src->head++ - src->tail] = '\n';
1214
1215           lex_source_expand__ (src);
1216           src->buffer[src->head++ - src->tail] = '\0';
1217
1218           return;
1219         }
1220
1221       src->head += n;
1222     }
1223   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1224                   src->head - src->seg_pos));
1225 }
1226
1227 static struct lex_source *
1228 lex_source__ (const struct lexer *lexer)
1229 {
1230   return (ll_is_empty (&lexer->sources) ? NULL
1231           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1232 }
1233
1234 static struct substring
1235 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1236 {
1237   const struct lex_token *token0 = lex_source_next__ (src, n0);
1238   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1239   size_t start = token0->token_pos;
1240   size_t end = token1->token_pos + token1->token_len;
1241
1242   return ss_buffer (&src->buffer[start - src->tail], end - start);
1243 }
1244
1245 static void
1246 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1247 {
1248   size_t out_maxlen;
1249   size_t out_len;
1250   int mblen;
1251
1252   assert (out_size >= 16);
1253   out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1254   for (out_len = 0; out_len < in.length; out_len += mblen)
1255     {
1256       if (in.string[out_len] == '\n'
1257           || (in.string[out_len] == '\r'
1258               && out_len + 1 < in.length
1259               && in.string[out_len + 1] == '\n'))
1260         break;
1261
1262       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1263                         in.length - out_len);
1264       if (out_len + mblen > out_maxlen)
1265         break;
1266     }
1267
1268   memcpy (out, in.string, out_len);
1269   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1270 }
1271
1272 static void
1273 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1274                          const char *format, va_list args)
1275 {
1276   const struct lex_token *token;
1277   struct string s;
1278   struct msg m;
1279
1280   ds_init_empty (&s);
1281
1282   token = lex_source_next__ (src, n0);
1283   if (token->token.type == T_ENDCMD)
1284     ds_put_cstr (&s, _("Syntax error at end of command"));
1285   else
1286     {
1287       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1288       if (!ss_is_empty (syntax))
1289         {
1290           char syntax_cstr[64];
1291
1292           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1293           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1294         }
1295       else
1296         ds_put_cstr (&s, _("Syntax error"));
1297     }
1298
1299   if (format)
1300     {
1301       ds_put_cstr (&s, ": ");
1302       ds_put_vformat (&s, format, args);
1303     }
1304   ds_put_byte (&s, '.');
1305
1306   m.category = MSG_C_SYNTAX;
1307   m.severity = MSG_S_ERROR;
1308   m.file_name = src->reader->file_name;
1309   m.first_line = lex_source_get_first_line_number (src, n0);
1310   m.last_line = lex_source_get_last_line_number (src, n1);
1311   m.first_column = lex_source_get_first_column (src, n0);
1312   m.last_column = lex_source_get_last_column (src, n1);
1313   m.text = ds_steal_cstr (&s);
1314   msg_emit (&m);
1315 }
1316
1317 static void PRINTF_FORMAT (2, 3)
1318 lex_get_error (struct lex_source *src, const char *format, ...)
1319 {
1320   va_list args;
1321   int n;
1322
1323   va_start (args, format);
1324
1325   n = deque_count (&src->deque) - 1;
1326   lex_source_error_valist (src, n, n, format, args);
1327   lex_source_pop_front (src);
1328
1329   va_end (args);
1330 }
1331
1332 /* Attempts to append an additional token into SRC's deque, reading more from
1333    the underlying lex_reader if necessary..  Returns true if successful, false
1334    if the deque already represents (a suffix of) the whole lex_reader's
1335    contents, */
1336 static bool
1337 lex_source_get__ (const struct lex_source *src_)
1338 {
1339   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1340   if (src->eof)
1341     return false;
1342
1343   /* State maintained while scanning tokens.  Usually we only need a single
1344      state, but scanner_push() can return SCAN_SAVE to indicate that the state
1345      needs to be saved and possibly restored later with SCAN_BACK. */
1346   struct state
1347     {
1348       struct segmenter segmenter;
1349       enum segment_type last_segment;
1350       int newlines;             /* Number of newlines encountered so far. */
1351       /* Maintained here so we can update lex_source's similar members when we
1352          finish. */
1353       size_t line_pos;
1354       size_t seg_pos;
1355     };
1356
1357   /* Initialize state. */
1358   struct state state =
1359     {
1360       .segmenter = src->segmenter,
1361       .newlines = 0,
1362       .seg_pos = src->seg_pos,
1363       .line_pos = src->line_pos,
1364     };
1365   struct state saved = state;
1366
1367   /* Append a new token to SRC and initialize it. */
1368   struct lex_token *token = lex_push_token__ (src);
1369   struct scanner scanner;
1370   scanner_init (&scanner, &token->token);
1371   token->line_pos = src->line_pos;
1372   token->token_pos = src->seg_pos;
1373   if (src->reader->line_number > 0)
1374     token->first_line = src->reader->line_number + src->n_newlines;
1375   else
1376     token->first_line = 0;
1377
1378   /* Extract segments and pass them through the scanner until we obtain a
1379      token. */
1380   for (;;)
1381     {
1382       /* Extract a segment. */
1383       const char *segment = &src->buffer[state.seg_pos - src->tail];
1384       size_t seg_maxlen = src->head - state.seg_pos;
1385       enum segment_type type;
1386       int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1387                                     &type);
1388       if (seg_len < 0)
1389         {
1390           /* The segmenter needs more input to produce a segment. */
1391           lex_source_read__ (src);
1392           continue;
1393         }
1394
1395       /* Update state based on the segment. */
1396       state.last_segment = type;
1397       state.seg_pos += seg_len;
1398       if (type == SEG_NEWLINE)
1399         {
1400           state.newlines++;
1401           state.line_pos = state.seg_pos;
1402         }
1403
1404       /* Pass the segment into the scanner and try to get a token out. */
1405       enum scan_result result = scanner_push (&scanner, type,
1406                                               ss_buffer (segment, seg_len),
1407                                               &token->token);
1408       if (result == SCAN_SAVE)
1409         saved = state;
1410       else if (result == SCAN_BACK)
1411         {
1412           state = saved;
1413           break;
1414         }
1415       else if (result == SCAN_DONE)
1416         break;
1417     }
1418
1419   /* If we've reached the end of a line, or the end of a command, then pass
1420      the line to the output engine as a syntax text item.  */
1421   int n_lines = state.newlines;
1422   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1423     {
1424       n_lines++;
1425       src->suppress_next_newline = true;
1426     }
1427   else if (n_lines > 0 && src->suppress_next_newline)
1428     {
1429       n_lines--;
1430       src->suppress_next_newline = false;
1431     }
1432   for (int i = 0; i < n_lines; i++)
1433     {
1434       const char *line = &src->buffer[src->journal_pos - src->tail];
1435       const char *newline = rawmemchr (line, '\n');
1436       size_t line_len = newline - line;
1437       if (line_len > 0 && line[line_len - 1] == '\r')
1438         line_len--;
1439
1440       char *syntax = malloc (line_len + 2);
1441       memcpy (syntax, line, line_len);
1442       syntax[line_len] = '\n';
1443       syntax[line_len + 1] = '\0';
1444
1445       text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1446
1447       src->journal_pos += newline - line + 1;
1448     }
1449
1450   token->token_len = state.seg_pos - src->seg_pos;
1451
1452   src->segmenter = state.segmenter;
1453   src->seg_pos = state.seg_pos;
1454   src->line_pos = state.line_pos;
1455   src->n_newlines += state.newlines;
1456
1457   switch (token->token.type)
1458     {
1459     default:
1460       break;
1461
1462     case T_STOP:
1463       token->token.type = T_ENDCMD;
1464       src->eof = true;
1465       break;
1466
1467     case SCAN_BAD_HEX_LENGTH:
1468       lex_get_error (src, _("String of hex digits has %d characters, which "
1469                             "is not a multiple of 2"),
1470                      (int) token->token.number);
1471       break;
1472
1473     case SCAN_BAD_HEX_DIGIT:
1474     case SCAN_BAD_UNICODE_DIGIT:
1475       lex_get_error (src, _("`%c' is not a valid hex digit"),
1476                      (int) token->token.number);
1477       break;
1478
1479     case SCAN_BAD_UNICODE_LENGTH:
1480       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1481                             "not in the valid range of 1 to 8 bytes"),
1482                      (int) token->token.number);
1483       break;
1484
1485     case SCAN_BAD_UNICODE_CODE_POINT:
1486       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1487                      (int) token->token.number);
1488       break;
1489
1490     case SCAN_EXPECTED_QUOTE:
1491       lex_get_error (src, _("Unterminated string constant"));
1492       break;
1493
1494     case SCAN_EXPECTED_EXPONENT:
1495       lex_get_error (src, _("Missing exponent following `%s'"),
1496                      token->token.string.string);
1497       break;
1498
1499     case SCAN_UNEXPECTED_DOT:
1500       lex_get_error (src, _("Unexpected `.' in middle of command"));
1501       break;
1502
1503     case SCAN_UNEXPECTED_CHAR:
1504       {
1505         char c_name[16];
1506         lex_get_error (src, _("Bad character %s in input"),
1507                        uc_name (token->token.number, c_name));
1508       }
1509       break;
1510
1511     case SCAN_SKIP:
1512       lex_source_pop_front (src);
1513       break;
1514     }
1515
1516   return true;
1517 }
1518 \f
1519 static void
1520 lex_source_push_endcmd__ (struct lex_source *src)
1521 {
1522   struct lex_token *token = lex_push_token__ (src);
1523   token->token.type = T_ENDCMD;
1524   token->token_pos = 0;
1525   token->token_len = 0;
1526   token->line_pos = 0;
1527   token->first_line = 0;
1528 }
1529
1530 static struct lex_source *
1531 lex_source_create (struct lex_reader *reader)
1532 {
1533   struct lex_source *src;
1534   enum segmenter_mode mode;
1535
1536   src = xzalloc (sizeof *src);
1537   src->reader = reader;
1538
1539   if (reader->syntax == LEX_SYNTAX_AUTO)
1540     mode = SEG_MODE_AUTO;
1541   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1542     mode = SEG_MODE_INTERACTIVE;
1543   else if (reader->syntax == LEX_SYNTAX_BATCH)
1544     mode = SEG_MODE_BATCH;
1545   else
1546     NOT_REACHED ();
1547   segmenter_init (&src->segmenter, mode);
1548
1549   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1550
1551   lex_source_push_endcmd__ (src);
1552
1553   return src;
1554 }
1555
1556 static void
1557 lex_source_destroy (struct lex_source *src)
1558 {
1559   char *file_name = src->reader->file_name;
1560   char *encoding = src->reader->encoding;
1561   if (src->reader->class->destroy != NULL)
1562     src->reader->class->destroy (src->reader);
1563   free (file_name);
1564   free (encoding);
1565   free (src->buffer);
1566   while (!deque_is_empty (&src->deque))
1567     lex_source_pop__ (src);
1568   free (src->tokens);
1569   ll_remove (&src->ll);
1570   free (src);
1571 }
1572 \f
/* A lex_reader that reads syntax from a file or from stdin. */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Generic reader embedded in this one;
                                   lex_file_reader_cast() converts a pointer
                                   to it back to the containing struct. */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
  };

/* Class for lex_file_reader.  Declared here so that lex_reader_for_file() can
   refer to it; the definition follows the callback functions. */
static struct lex_reader_class lex_file_reader_class;
1580
1581 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1582    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1583    ENCODING, which should take one of the forms accepted by
1584    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1585    mode of the new reader, respectively.
1586
1587    Returns a null pointer if FILE_NAME cannot be opened. */
1588 struct lex_reader *
1589 lex_reader_for_file (const char *file_name, const char *encoding,
1590                      enum lex_syntax_mode syntax,
1591                      enum lex_error_mode error)
1592 {
1593   struct lex_file_reader *r;
1594   struct u8_istream *istream;
1595
1596   istream = (!strcmp(file_name, "-")
1597              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1598              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1599   if (istream == NULL)
1600     {
1601       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1602       return NULL;
1603     }
1604
1605   r = xmalloc (sizeof *r);
1606   lex_reader_init (&r->reader, &lex_file_reader_class);
1607   r->reader.syntax = syntax;
1608   r->reader.error = error;
1609   r->reader.file_name = xstrdup (file_name);
1610   r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1611   r->reader.line_number = 1;
1612   r->istream = istream;
1613
1614   return &r->reader;
1615 }
1616
1617 static struct lex_file_reader *
1618 lex_file_reader_cast (struct lex_reader *r)
1619 {
1620   return UP_CAST (r, struct lex_file_reader, reader);
1621 }
1622
1623 static size_t
1624 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1625                enum prompt_style prompt_style UNUSED)
1626 {
1627   struct lex_file_reader *r = lex_file_reader_cast (r_);
1628   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1629   if (n_read < 0)
1630     {
1631       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1632       return 0;
1633     }
1634   return n_read;
1635 }
1636
1637 static void
1638 lex_file_close (struct lex_reader *r_)
1639 {
1640   struct lex_file_reader *r = lex_file_reader_cast (r_);
1641
1642   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1643     {
1644       if (u8_istream_close (r->istream) != 0)
1645         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1646     }
1647   else
1648     u8_istream_free (r->istream);
1649
1650   free (r);
1651 }
1652
1653 static struct lex_reader_class lex_file_reader_class =
1654   {
1655     lex_file_read,
1656     lex_file_close
1657   };
1658 \f
/* A lex_reader that reads syntax from a string held in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Generic reader embedded in this one;
                                   lex_string_reader_cast() converts a pointer
                                   to it back to the containing struct. */
    struct substring s;         /* Text to read.  lex_string_close() releases
                                   it with ss_dealloc(). */
    size_t offset;              /* Number of bytes of S that lex_string_read()
                                   has already returned. */
  };

/* Class for lex_string_reader.  Declared here so that
   lex_reader_for_substring_nocopy() can refer to it; the definition follows
   the callback functions. */
static struct lex_reader_class lex_string_reader_class;
1667
1668 /* Creates and returns a new lex_reader for the contents of S, which must be
1669    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
1670    with ss_dealloc() when it is closed. */
1671 struct lex_reader *
1672 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1673 {
1674   struct lex_string_reader *r;
1675
1676   r = xmalloc (sizeof *r);
1677   lex_reader_init (&r->reader, &lex_string_reader_class);
1678   r->reader.syntax = LEX_SYNTAX_AUTO;
1679   r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1680   r->s = s;
1681   r->offset = 0;
1682
1683   return &r->reader;
1684 }
1685
1686 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1687    which must be encoded in ENCODING.  The caller retains ownership of S. */
1688 struct lex_reader *
1689 lex_reader_for_string (const char *s, const char *encoding)
1690 {
1691   struct substring ss;
1692   ss_alloc_substring (&ss, ss_cstr (s));
1693   return lex_reader_for_substring_nocopy (ss, encoding);
1694 }
1695
/* Formats FORMAT, a printf()-like format string, with the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   text, which must be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1710
1711 static struct lex_string_reader *
1712 lex_string_reader_cast (struct lex_reader *r)
1713 {
1714   return UP_CAST (r, struct lex_string_reader, reader);
1715 }
1716
1717 static size_t
1718 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1719                  enum prompt_style prompt_style UNUSED)
1720 {
1721   struct lex_string_reader *r = lex_string_reader_cast (r_);
1722   size_t chunk;
1723
1724   chunk = MIN (n, r->s.length - r->offset);
1725   memcpy (buf, r->s.string + r->offset, chunk);
1726   r->offset += chunk;
1727
1728   return chunk;
1729 }
1730
1731 static void
1732 lex_string_close (struct lex_reader *r_)
1733 {
1734   struct lex_string_reader *r = lex_string_reader_cast (r_);
1735
1736   ss_dealloc (&r->s);
1737   free (r);
1738 }
1739
1740 static struct lex_reader_class lex_string_reader_class =
1741   {
1742     lex_string_read,
1743     lex_string_close
1744   };