pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/text-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source: the parsed token itself plus the position of
   the source text that produced it. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head.

       These are offsets into the source as a whole; code that indexes
       src->buffer subtracts src->tail from them first. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos (0 is treated as
                                   "no line number" by the line-number
                                   accessors). */
  };
  71
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes.

       'tail' and 'head' are offsets into the source as a whole, so a source
       offset POS corresponds to buffer[POS - tail]. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       New tokens are pushed at the front of the deque and consumed from the
       back, so index 0 from the back is the current token. */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 101
/* Forward declarations of the file-local lex_source constructor and
   destructor. */
static struct lex_source *lex_source_create (struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 104
/* Lexer.

   A lexer is just an ordered list of token sources.  lex_include() adds a
   source at the head of the list and lex_append() adds one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
  };
 110
/* Forward declarations of file-local helpers. */
static struct lex_source *lex_source__ (const struct lexer *);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static void lex_source_pop__ (struct lex_source *);
static bool lex_source_get__ (const struct lex_source *);
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 122 \f
 123 /* Initializes READER with the specified CLASS and otherwise some reasonable
 124    defaults.  The caller should fill in the others members as desired. */
 125 void
 126 lex_reader_init (struct lex_reader *reader,
 127                  const struct lex_reader_class *class)
 128 {
 129   reader->class = class;
 130   reader->syntax = LEX_SYNTAX_AUTO;
 131   reader->error = LEX_ERROR_CONTINUE;
 132   reader->file_name = NULL;
 133   reader->encoding = NULL;
 134   reader->line_number = 0;
 135 }
 136
 137 /* Frees any file name already in READER and replaces it by a copy of
 138    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 139 void
 140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 141 {
 142   free (reader->file_name);
 143   reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
 144 }
 145 \f
 146 /* Creates and returns a new lexer. */
 147 struct lexer *
 148 lex_create (void)
 149 {
 150   struct lexer *lexer = xzalloc (sizeof *lexer);
 151   ll_init (&lexer->sources);
 152   return lexer;
 153 }
 154
 155 /* Destroys LEXER. */
 156 void
 157 lex_destroy (struct lexer *lexer)
 158 {
 159   if (lexer != NULL)
 160     {
 161       struct lex_source *source, *next;
 162
 163       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 164         lex_source_destroy (source);
 165       free (lexer);
 166     }
 167 }
 168
 169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 170    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 171    token. */
 172 void
 173 lex_include (struct lexer *lexer, struct lex_reader *reader)
 174 {
 175   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 176   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 177 }
 178
 179 /* Appends READER to LEXER, so that it will be read after all other current
 180    readers have already been read. */
 181 void
 182 lex_append (struct lexer *lexer, struct lex_reader *reader)
 183 {
 184   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 185 }
 186 \f
 187 /* Advancing. */
 188
 189 static struct lex_token *
 190 lex_push_token__ (struct lex_source *src)
 191 {
 192   struct lex_token *token;
 193
 194   if (deque_is_full (&src->deque))
 195     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 196
 197   token = &src->tokens[deque_push_front (&src->deque)];
 198   token_init (&token->token);
 199   return token;
 200 }
 201
 202 static void
 203 lex_source_pop__ (struct lex_source *src)
 204 {
 205   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 206 }
 207
 208 static void
 209 lex_source_pop_front (struct lex_source *src)
 210 {
 211   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 212 }
 213
 214 /* Advances LEXER to the next token, consuming the current token. */
 215 void
 216 lex_get (struct lexer *lexer)
 217 {
 218   struct lex_source *src;
 219
 220   src = lex_source__ (lexer);
 221   if (src == NULL)
 222     return;
 223
 224   if (!deque_is_empty (&src->deque))
 225     lex_source_pop__ (src);
 226
 227   while (deque_is_empty (&src->deque))
 228     if (!lex_source_get__ (src))
 229       {
 230         lex_source_destroy (src);
 231         src = lex_source__ (lexer);
 232         if (src == NULL)
 233           return;
 234       }
 235 }
 236 \f
 237 /* Issuing errors. */
 238
 239 /* Prints a syntax error message containing the current token and
 240    given message MESSAGE (if non-null). */
 241 void
 242 lex_error (struct lexer *lexer, const char *format, ...)
 243 {
 244   va_list args;
 245
 246   va_start (args, format);
 247   lex_next_error_valist (lexer, 0, 0, format, args);
 248   va_end (args);
 249 }
 250
 251 /* Prints a syntax error message containing the current token and
 252    given message MESSAGE (if non-null). */
 253 void
 254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 255 {
 256   lex_next_error_valist (lexer, 0, 0, format, args);
 257 }
 258
 259 /* Prints a syntax error message containing the current token and
 260    given message MESSAGE (if non-null). */
 261 void
 262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 263 {
 264   va_list args;
 265
 266   va_start (args, format);
 267   lex_next_error_valist (lexer, n0, n1, format, args);
 268   va_end (args);
 269 }
 270
 271 /* Prints a syntax error message saying that OPTION0 or one of the other
 272    strings following it, up to the first NULL, is expected. */
 273 void
 274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
 275 {
 276   enum { MAX_OPTIONS = 8 };
 277   const char *options[MAX_OPTIONS + 1];
 278   va_list args;
 279   int n;
 280
 281   va_start (args, option0);
 282   options[0] = option0;
 283   n = 0;
 284   while (n + 1 < MAX_OPTIONS && options[n] != NULL)
 285     options[++n] = va_arg (args, const char *);
 286   va_end (args);
 287
 288   switch (n)
 289     {
 290     case 0:
 291       lex_error (lexer, NULL);
 292       break;
 293
 294     case 1:
 295       lex_error (lexer, _("expecting %s"), options[0]);
 296       break;
 297
 298     case 2:
 299       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 300       break;
 301
 302     case 3:
 303       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 304                  options[2]);
 305       break;
 306
 307     case 4:
 308       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 309                  options[0], options[1], options[2], options[3]);
 310       break;
 311
 312     case 5:
 313       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 314                  options[0], options[1], options[2], options[3], options[4]);
 315       break;
 316
 317     case 6:
 318       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 319                  options[0], options[1], options[2], options[3], options[4],
 320                  options[5]);
 321       break;
 322
 323     case 7:
 324       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 325                  options[0], options[1], options[2], options[3], options[4],
 326                  options[5], options[6]);
 327       break;
 328
 329     case 8:
 330       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 331                  options[0], options[1], options[2], options[3], options[4],
 332                  options[5], options[6], options[7]);
 333       break;
 334
 335     default:
 336       NOT_REACHED ();
 337     }
 338 }
 339
 340 /* Reports an error to the effect that subcommand SBC may only be specified
 341    once.
 342
 343    This function does not take a lexer as an argument or use lex_error(),
 344    because the result would ordinarily just be redundant: "Syntax error at
 345    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 346    not help the user find the error. */
 347 void
 348 lex_sbc_only_once (const char *sbc)
 349 {
 350   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 351 }
 352
 353 /* Reports an error to the effect that subcommand SBC is missing.
 354
 355    This function does not take a lexer as an argument or use lex_error(),
 356    because a missing subcommand can normally be detected only after the whole
 357    command has been parsed, and so lex_error() would always report "Syntax
 358    error at end of command", which does not help the user find the error. */
 359 void
 360 lex_sbc_missing (const char *sbc)
 361 {
 362   msg (SE, _("Required subcommand %s was not specified."), sbc);
 363 }
 364
 365 /* Reports an error to the effect that specification SPEC may only be specified
 366    once within subcommand SBC. */
 367 void
 368 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 369 {
 370   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 371              spec, sbc);
 372 }
 373
 374 /* Reports an error to the effect that specification SPEC is missing within
 375    subcommand SBC. */
 376 void
 377 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 378 {
 379   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 380              sbc, spec);
 381 }
 382
 383 /* Prints a syntax error message containing the current token and
 384    given message MESSAGE (if non-null). */
 385 void
 386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 387                        const char *format, va_list args)
 388 {
 389   struct lex_source *src = lex_source__ (lexer);
 390
 391   if (src != NULL)
 392     lex_source_error_valist (src, n0, n1, format, args);
 393   else
 394     {
 395       struct string s;
 396
 397       ds_init_empty (&s);
 398       ds_put_format (&s, _("Syntax error at end of input"));
 399       if (format != NULL)
 400         {
 401           ds_put_cstr (&s, ": ");
 402           ds_put_vformat (&s, format, args);
 403         }
 404       ds_put_byte (&s, '.');
 405       msg (SE, "%s", ds_cstr (&s));
 406       ds_destroy (&s);
 407     }
 408 }
 409
 410 /* Checks that we're at end of command.
 411    If so, returns a successful command completion code.
 412    If not, flags a syntax error and returns an error command
 413    completion code. */
 414 int
 415 lex_end_of_command (struct lexer *lexer)
 416 {
 417   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 418     {
 419       lex_error (lexer, _("expecting end of command"));
 420       return CMD_FAILURE;
 421     }
 422   else
 423     return CMD_SUCCESS;
 424 }
 425 \f
 426 /* Token testing functions. */
 427
 428 /* Returns true if the current token is a number. */
 429 bool
 430 lex_is_number (struct lexer *lexer)
 431 {
 432   return lex_next_is_number (lexer, 0);
 433 }
 434
 435 /* Returns true if the current token is a string. */
 436 bool
 437 lex_is_string (struct lexer *lexer)
 438 {
 439   return lex_next_is_string (lexer, 0);
 440 }
 441
 442 /* Returns the value of the current token, which must be a
 443    floating point number. */
 444 double
 445 lex_number (struct lexer *lexer)
 446 {
 447   return lex_next_number (lexer, 0);
 448 }
 449
 450 /* Returns true iff the current token is an integer. */
 451 bool
 452 lex_is_integer (struct lexer *lexer)
 453 {
 454   return lex_next_is_integer (lexer, 0);
 455 }
 456
 457 /* Returns the value of the current token, which must be an
 458    integer. */
 459 long
 460 lex_integer (struct lexer *lexer)
 461 {
 462   return lex_next_integer (lexer, 0);
 463 }
 464 \f
 465 /* Token testing functions with lookahead.
 466
 467    A value of 0 for N as an argument to any of these functions refers to the
 468    current token.  Lookahead is limited to the current command.  Any N greater
 469    than the number of tokens remaining in the current command will be treated
 470    as referring to a T_ENDCMD token. */
 471
 472 /* Returns true if the token N ahead of the current token is a number. */
 473 bool
 474 lex_next_is_number (struct lexer *lexer, int n)
 475 {
 476   enum token_type next_token = lex_next_token (lexer, n);
 477   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 478 }
 479
 480 /* Returns true if the token N ahead of the current token is a string. */
 481 bool
 482 lex_next_is_string (struct lexer *lexer, int n)
 483 {
 484   return lex_next_token (lexer, n) == T_STRING;
 485 }
 486
 487 /* Returns the value of the token N ahead of the current token, which must be a
 488    floating point number. */
 489 double
 490 lex_next_number (struct lexer *lexer, int n)
 491 {
 492   assert (lex_next_is_number (lexer, n));
 493   return lex_next_tokval (lexer, n);
 494 }
 495
 496 /* Returns true if the token N ahead of the current token is an integer. */
 497 bool
 498 lex_next_is_integer (struct lexer *lexer, int n)
 499 {
 500   double value;
 501
 502   if (!lex_next_is_number (lexer, n))
 503     return false;
 504
 505   value = lex_next_tokval (lexer, n);
 506   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 507 }
 508
 509 /* Returns the value of the token N ahead of the current token, which must be
 510    an integer. */
 511 long
 512 lex_next_integer (struct lexer *lexer, int n)
 513 {
 514   assert (lex_next_is_integer (lexer, n));
 515   return lex_next_tokval (lexer, n);
 516 }
 517 \f
 518 /* Token matching functions. */
 519
 520 /* If the current token has the specified TYPE, skips it and returns true.
 521    Otherwise, returns false. */
 522 bool
 523 lex_match (struct lexer *lexer, enum token_type type)
 524 {
 525   if (lex_token (lexer) == type)
 526     {
 527       lex_get (lexer);
 528       return true;
 529     }
 530   else
 531     return false;
 532 }
 533
 534 /* If the current token matches IDENTIFIER, skips it and returns true.
 535    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 536    returns false.
 537
 538    IDENTIFIER must be an ASCII string. */
 539 bool
 540 lex_match_id (struct lexer *lexer, const char *identifier)
 541 {
 542   return lex_match_id_n (lexer, identifier, 3);
 543 }
 544
 545 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 546    may be abbreviated to its first N letters.  Otherwise, returns false.
 547
 548    IDENTIFIER must be an ASCII string. */
 549 bool
 550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 551 {
 552   if (lex_token (lexer) == T_ID
 553       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 554     {
 555       lex_get (lexer);
 556       return true;
 557     }
 558   else
 559     return false;
 560 }
 561
 562 /* If the current token is integer X, skips it and returns true.  Otherwise,
 563    returns false. */
 564 bool
 565 lex_match_int (struct lexer *lexer, int x)
 566 {
 567   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 568     {
 569       lex_get (lexer);
 570       return true;
 571     }
 572   else
 573     return false;
 574 }
 575 \f
 576 /* Forced matches. */
 577
 578 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 579    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 580    false.
 581
 582    IDENTIFIER must be an ASCII string. */
 583 bool
 584 lex_force_match_id (struct lexer *lexer, const char *identifier)
 585 {
 586   if (lex_match_id (lexer, identifier))
 587     return true;
 588   else
 589     {
 590       lex_error_expecting (lexer, identifier, NULL_SENTINEL);
 591       return false;
 592     }
 593 }
 594
 595 /* If the current token has the specified TYPE, skips it and returns true.
 596    Otherwise, reports an error and returns false. */
 597 bool
 598 lex_force_match (struct lexer *lexer, enum token_type type)
 599 {
 600   if (lex_token (lexer) == type)
 601     {
 602       lex_get (lexer);
 603       return true;
 604     }
 605   else
 606     {
 607       const char *type_string = token_type_to_string (type);
 608       if (type_string)
 609         {
 610           char *s = xasprintf ("`%s'", type_string);
 611           lex_error_expecting (lexer, s, NULL_SENTINEL);
 612           free (s);
 613         }
 614       else
 615         lex_error_expecting (lexer, token_type_to_name (type), NULL_SENTINEL);
 616
 617       return false;
 618     }
 619 }
 620
 621 /* If the current token is a string, does nothing and returns true.
 622    Otherwise, reports an error and returns false. */
 623 bool
 624 lex_force_string (struct lexer *lexer)
 625 {
 626   if (lex_is_string (lexer))
 627     return true;
 628   else
 629     {
 630       lex_error (lexer, _("expecting string"));
 631       return false;
 632     }
 633 }
 634
 635 /* If the current token is a string or an identifier, does nothing and returns
 636    true.  Otherwise, reports an error and returns false.
 637
 638    This is meant for use in syntactic situations where we want to encourage the
 639    user to supply a quoted string, but for compatibility we also accept
 640    identifiers.  (One example of such a situation is file names.)  Therefore,
 641    the error message issued when the current token is wrong only says that a
 642    string is expected and doesn't mention that an identifier would also be
 643    accepted. */
 644 bool
 645 lex_force_string_or_id (struct lexer *lexer)
 646 {
 647   return lex_is_integer (lexer) || lex_force_string (lexer);
 648 }
 649
 650 /* If the current token is an integer, does nothing and returns true.
 651    Otherwise, reports an error and returns false. */
 652 bool
 653 lex_force_int (struct lexer *lexer)
 654 {
 655   if (lex_is_integer (lexer))
 656     return true;
 657   else
 658     {
 659       lex_error (lexer, _("expecting integer"));
 660       return false;
 661     }
 662 }
 663
 664 /* If the current token is a number, does nothing and returns true.
 665    Otherwise, reports an error and returns false. */
 666 bool
 667 lex_force_num (struct lexer *lexer)
 668 {
 669   if (lex_is_number (lexer))
 670     return true;
 671
 672   lex_error (lexer, _("expecting number"));
 673   return false;
 674 }
 675
 676 /* If the current token is an identifier, does nothing and returns true.
 677    Otherwise, reports an error and returns false. */
 678 bool
 679 lex_force_id (struct lexer *lexer)
 680 {
 681   if (lex_token (lexer) == T_ID)
 682     return true;
 683
 684   lex_error (lexer, _("expecting identifier"));
 685   return false;
 686 }
 687 \f
 688 /* Token accessors. */
 689
 690 /* Returns the type of LEXER's current token. */
 691 enum token_type
 692 lex_token (const struct lexer *lexer)
 693 {
 694   return lex_next_token (lexer, 0);
 695 }
 696
 697 /* Returns the number in LEXER's current token.
 698
 699    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 700    tokens this function will always return zero. */
 701 double
 702 lex_tokval (const struct lexer *lexer)
 703 {
 704   return lex_next_tokval (lexer, 0);
 705 }
 706
 707 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 708
 709    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 710    this functions this function will always return NULL.
 711
 712    The UTF-8 encoding of the returned string is correct for variable names and
 713    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 714    data_in() to use it in a "union value".  */
 715 const char *
 716 lex_tokcstr (const struct lexer *lexer)
 717 {
 718   return lex_next_tokcstr (lexer, 0);
 719 }
 720
 721 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 722    null-terminated (but the null terminator is not included in the returned
 723    substring's 'length').
 724
 725    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 726    this functions this function will always return NULL.
 727
 728    The UTF-8 encoding of the returned string is correct for variable names and
 729    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 730    data_in() to use it in a "union value".  */
 731 struct substring
 732 lex_tokss (const struct lexer *lexer)
 733 {
 734   return lex_next_tokss (lexer, 0);
 735 }
 736 \f
 737 /* Looking ahead.
 738
 739    A value of 0 for N as an argument to any of these functions refers to the
 740    current token.  Lookahead is limited to the current command.  Any N greater
 741    than the number of tokens remaining in the current command will be treated
 742    as referring to a T_ENDCMD token. */
 743
 744 static const struct lex_token *
 745 lex_next__ (const struct lexer *lexer_, int n)
 746 {
 747   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 748   struct lex_source *src = lex_source__ (lexer);
 749
 750   if (src != NULL)
 751     return lex_source_next__ (src, n);
 752   else
 753     {
 754       static const struct lex_token stop_token =
 755         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 756
 757       return &stop_token;
 758     }
 759 }
 760
 761 static const struct lex_token *
 762 lex_source_next__ (const struct lex_source *src, int n)
 763 {
 764   while (deque_count (&src->deque) <= n)
 765     {
 766       if (!deque_is_empty (&src->deque))
 767         {
 768           struct lex_token *front;
 769
 770           front = &src->tokens[deque_front (&src->deque, 0)];
 771           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 772             return front;
 773         }
 774
 775       lex_source_get__ (src);
 776     }
 777
 778   return &src->tokens[deque_back (&src->deque, n)];
 779 }
 780
 781 /* Returns the "struct token" of the token N after the current one in LEXER.
 782    The returned pointer can be invalidated by pretty much any succeeding call
 783    into the lexer, although the string pointer within the returned token is
 784    only invalidated by consuming the token (e.g. with lex_get()). */
 785 const struct token *
 786 lex_next (const struct lexer *lexer, int n)
 787 {
 788   return &lex_next__ (lexer, n)->token;
 789 }
 790
 791 /* Returns the type of the token N after the current one in LEXER. */
 792 enum token_type
 793 lex_next_token (const struct lexer *lexer, int n)
 794 {
 795   return lex_next (lexer, n)->type;
 796 }
 797
 798 /* Returns the number in the tokn N after the current one in LEXER.
 799
 800    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 801    tokens this function will always return zero. */
 802 double
 803 lex_next_tokval (const struct lexer *lexer, int n)
 804 {
 805   const struct token *token = lex_next (lexer, n);
 806   return token->number;
 807 }
 808
 809 /* Returns the null-terminated string in the token N after the current one, in
 810    UTF-8 encoding.
 811
 812    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 813    this functions this function will always return NULL.
 814
 815    The UTF-8 encoding of the returned string is correct for variable names and
 816    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 817    data_in() to use it in a "union value".  */
 818 const char *
 819 lex_next_tokcstr (const struct lexer *lexer, int n)
 820 {
 821   return lex_next_tokss (lexer, n).string;
 822 }
 823
 824 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 825    The string is null-terminated (but the null terminator is not included in
 826    the returned substring's 'length').
 827
 828    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 829    this functions this function will always return NULL.
 830
 831    The UTF-8 encoding of the returned string is correct for variable names and
 832    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 833    data_in() to use it in a "union value".  */
 834 struct substring
 835 lex_next_tokss (const struct lexer *lexer, int n)
 836 {
 837   return lex_next (lexer, n)->string;
 838 }
 839
 840 static bool
 841 lex_tokens_match (const struct token *actual, const struct token *expected)
 842 {
 843   if (actual->type != expected->type)
 844     return false;
 845
 846   switch (actual->type)
 847     {
 848     case T_POS_NUM:
 849     case T_NEG_NUM:
 850       return actual->number == expected->number;
 851
 852     case T_ID:
 853       return lex_id_match (expected->string, actual->string);
 854
 855     case T_STRING:
 856       return (actual->string.length == expected->string.length
 857               && !memcmp (actual->string.string, expected->string.string,
 858                           actual->string.length));
 859
 860     default:
 861       return true;
 862     }
 863 }
 864
 865 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
 866    skips it and returns true.  Otherwise, returns false.
 867
 868    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
 869    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
 870    first three letters. */
 871 bool
 872 lex_match_phrase (struct lexer *lexer, const char *s)
 873 {
 874   struct string_lexer slex;
 875   struct token token;
 876   int i;
 877
 878   i = 0;
 879   string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
 880   while (string_lexer_next (&slex, &token))
 881     if (token.type != SCAN_SKIP)
 882       {
 883         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
 884         token_destroy (&token);
 885         if (!match)
 886           return false;
 887       }
 888
 889   while (i-- > 0)
 890     lex_get (lexer);
 891   return true;
 892 }
 893
 894 static int
 895 lex_source_get_first_line_number (const struct lex_source *src, int n)
 896 {
 897   return lex_source_next__ (src, n)->first_line;
 898 }
 899
 900 static int
 901 count_newlines (char *s, size_t length)
 902 {
 903   int n_newlines = 0;
 904   char *newline;
 905
 906   while ((newline = memchr (s, '\n', length)) != NULL)
 907     {
 908       n_newlines++;
 909       length -= (newline + 1) - s;
 910       s = newline + 1;
 911     }
 912
 913   return n_newlines;
 914 }
 915
 916 static int
 917 lex_source_get_last_line_number (const struct lex_source *src, int n)
 918 {
 919   const struct lex_token *token = lex_source_next__ (src, n);
 920
 921   if (token->first_line == 0)
 922     return 0;
 923   else
 924     {
 925       char *token_str = &src->buffer[token->token_pos - src->tail];
 926       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 927     }
 928 }
 929
 930 static int
 931 count_columns (const char *s_, size_t length)
 932 {
 933   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 934   int columns;
 935   size_t ofs;
 936   int mblen;
 937
 938   columns = 0;
 939   for (ofs = 0; ofs < length; ofs += mblen)
 940     {
 941       ucs4_t uc;
 942
 943       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 944       if (uc != '\t')
 945         {
 946           int width = uc_width (uc, "UTF-8");
 947           if (width > 0)
 948             columns += width;
 949         }
 950       else
 951         columns = ROUND_UP (columns + 1, 8);
 952     }
 953
 954   return columns + 1;
 955 }
 956
 957 static int
 958 lex_source_get_first_column (const struct lex_source *src, int n)
 959 {
 960   const struct lex_token *token = lex_source_next__ (src, n);
 961   return count_columns (&src->buffer[token->line_pos - src->tail],
 962                         token->token_pos - token->line_pos);
 963 }
 964
 965 static int
 966 lex_source_get_last_column (const struct lex_source *src, int n)
 967 {
 968   const struct lex_token *token = lex_source_next__ (src, n);
 969   char *start, *end, *newline;
 970
 971   start = &src->buffer[token->line_pos - src->tail];
 972   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 973   newline = memrchr (start, '\n', end - start);
 974   if (newline != NULL)
 975     start = newline + 1;
 976   return count_columns (start, end - start);
 977 }
 978
 979 /* Returns the 1-based line number of the start of the syntax that represents
 980    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
 981    if the token is drawn from a source that does not have line numbers. */
 982 int
 983 lex_get_first_line_number (const struct lexer *lexer, int n)
 984 {
 985   const struct lex_source *src = lex_source__ (lexer);
 986   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
 987 }
 988
 989 /* Returns the 1-based line number of the end of the syntax that represents the
 990    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 991    token or if the token is drawn from a source that does not have line
 992    numbers.
 993
 994    Most of the time, a single token is wholly within a single line of syntax,
 995    but there are two exceptions: a T_STRING token can be made up of multiple
 996    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
 997    token can consist of a "-" on one line followed by the number on the next.
 998  */
 999 int
1000 lex_get_last_line_number (const struct lexer *lexer, int n)
1001 {
1002   const struct lex_source *src = lex_source__ (lexer);
1003   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1004 }
1005
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER begins.  Returns 0 if LEXER has no source or for a
   T_STOP token.

   Columns are measured by the width of characters as displayed in a typical
   fixed-width font: CJK characters have width 2 and combining characters
   have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_first_column (src, n);
}
1019
/* Returns 1 plus the 1-based column at which the syntax for the token N
   after the current one in LEXER ends.  Returns 0 if LEXER has no source or
   for a T_STOP token.

   Columns are measured by the width of characters as displayed in a typical
   fixed-width font: CJK characters have width 2 and combining characters
   have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_last_column (src, n);
}
1033
1034 /* Returns the name of the syntax file from which the current command is drawn.
1035    Returns NULL for a T_STOP token or if the command's source does not have
1036    line numbers.
1037
1038    There is no version of this function that takes an N argument because
1039    lookahead only works to the end of a command and any given command is always
1040    within a single syntax file. */
1041 const char *
1042 lex_get_file_name (const struct lexer *lexer)
1043 {
1044   struct lex_source *src = lex_source__ (lexer);
1045   return src == NULL ? NULL : src->reader->file_name;
1046 }
1047
1048 const char *
1049 lex_get_encoding (const struct lexer *lexer)
1050 {
1051   struct lex_source *src = lex_source__ (lexer);
1052   return src == NULL ? NULL : src->reader->encoding;
1053 }
1054
1055
1056 /* Returns the syntax mode for the syntax file from which the current drawn is
1057    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1058    source does not have line numbers.
1059
1060    There is no version of this function that takes an N argument because
1061    lookahead only works to the end of a command and any given command is always
1062    within a single syntax file. */
1063 enum lex_syntax_mode
1064 lex_get_syntax_mode (const struct lexer *lexer)
1065 {
1066   struct lex_source *src = lex_source__ (lexer);
1067   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1068 }
1069
1070 /* Returns the error mode for the syntax file from which the current drawn is
1071    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1072    source does not have line numbers.
1073
1074    There is no version of this function that takes an N argument because
1075    lookahead only works to the end of a command and any given command is always
1076    within a single syntax file. */
1077 enum lex_error_mode
1078 lex_get_error_mode (const struct lexer *lexer)
1079 {
1080   struct lex_source *src = lex_source__ (lexer);
1081   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1082 }
1083
1084 /* If the source that LEXER is currently reading has error mode
1085    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1086    token to be read comes directly from whatever is next read from the stream.
1087
1088    It makes sense to call this function after encountering an error in a
1089    command entered on the console, because usually the user would prefer not to
1090    have cascading errors. */
1091 void
1092 lex_interactive_reset (struct lexer *lexer)
1093 {
1094   struct lex_source *src = lex_source__ (lexer);
1095   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1096     {
1097       src->head = src->tail = 0;
1098       src->journal_pos = src->seg_pos = src->line_pos = 0;
1099       src->n_newlines = 0;
1100       src->suppress_next_newline = false;
1101       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1102       while (!deque_is_empty (&src->deque))
1103         lex_source_pop__ (src);
1104       lex_source_push_endcmd__ (src);
1105     }
1106 }
1107
1108 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1109 void
1110 lex_discard_rest_of_command (struct lexer *lexer)
1111 {
1112   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1113     lex_get (lexer);
1114 }
1115
1116 /* Discards all lookahead tokens in LEXER, then discards all input sources
1117    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1118    runs out of input sources. */
1119 void
1120 lex_discard_noninteractive (struct lexer *lexer)
1121 {
1122   struct lex_source *src = lex_source__ (lexer);
1123
1124   if (src != NULL)
1125     {
1126       while (!deque_is_empty (&src->deque))
1127         lex_source_pop__ (src);
1128
1129       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1130            src = lex_source__ (lexer))
1131         lex_source_destroy (src);
1132     }
1133 }
1134 \f
1135 static size_t
1136 lex_source_max_tail__ (const struct lex_source *src)
1137 {
1138   const struct lex_token *token;
1139   size_t max_tail;
1140
1141   assert (src->seg_pos >= src->line_pos);
1142   max_tail = MIN (src->journal_pos, src->line_pos);
1143
1144   /* Use the oldest token also.  (We know that src->deque cannot be empty
1145      because we are in the process of adding a new token, which is already
1146      initialized enough to use here.) */
1147   token = &src->tokens[deque_back (&src->deque, 0)];
1148   assert (token->token_pos >= token->line_pos);
1149   max_tail = MIN (max_tail, token->line_pos);
1150
1151   return max_tail;
1152 }
1153
1154 static void
1155 lex_source_expand__ (struct lex_source *src)
1156 {
1157   if (src->head - src->tail >= src->allocated)
1158     {
1159       size_t max_tail = lex_source_max_tail__ (src);
1160       if (max_tail > src->tail)
1161         {
1162           /* Advance the tail, freeing up room at the head. */
1163           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1164                    src->head - max_tail);
1165           src->tail = max_tail;
1166         }
1167       else
1168         {
1169           /* Buffer is completely full.  Expand it. */
1170           src->buffer = x2realloc (src->buffer, &src->allocated);
1171         }
1172     }
1173   else
1174     {
1175       /* There's space available at the head of the buffer.  Nothing to do. */
1176     }
1177 }
1178
1179 static void
1180 lex_source_read__ (struct lex_source *src)
1181 {
1182   do
1183     {
1184       lex_source_expand__ (src);
1185
1186       size_t head_ofs = src->head - src->tail;
1187       size_t space = src->allocated - head_ofs;
1188       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1189       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1190                                            space, prompt);
1191       assert (n <= space);
1192
1193       for (char *p = &src->buffer[head_ofs]; p < &src->buffer[head_ofs + n];
1194            p++)
1195         if (*p == '\0')
1196           {
1197             struct msg m;
1198             m.category = MSG_C_SYNTAX;
1199             m.severity = MSG_S_ERROR;
1200             m.file_name = src->reader->file_name;
1201             m.first_line = 0;
1202             m.last_line = 0;
1203             m.first_column = 0;
1204             m.last_column = 0;
1205             m.text = xstrdup ("Bad character U+0000 in input.");
1206             msg_emit (&m);
1207
1208             *p = ' ';
1209           }
1210
1211       if (n == 0)
1212         {
1213           /* End of input.
1214
1215              Ensure that the input always ends in a new-line followed by a null
1216              byte, as required by the segmenter library. */
1217
1218           if (src->head == src->tail
1219               || src->buffer[src->head - src->tail - 1] != '\n')
1220             src->buffer[src->head++ - src->tail] = '\n';
1221
1222           lex_source_expand__ (src);
1223           src->buffer[src->head++ - src->tail] = '\0';
1224
1225           return;
1226         }
1227
1228       src->head += n;
1229     }
1230   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1231                   src->head - src->seg_pos));
1232 }
1233
1234 static struct lex_source *
1235 lex_source__ (const struct lexer *lexer)
1236 {
1237   return (ll_is_empty (&lexer->sources) ? NULL
1238           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1239 }
1240
1241 static struct substring
1242 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1243 {
1244   const struct lex_token *token0 = lex_source_next__ (src, n0);
1245   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1246   size_t start = token0->token_pos;
1247   size_t end = token1->token_pos + token1->token_len;
1248
1249   return ss_buffer (&src->buffer[start - src->tail], end - start);
1250 }
1251
1252 static void
1253 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1254 {
1255   size_t out_maxlen;
1256   size_t out_len;
1257   int mblen;
1258
1259   assert (out_size >= 16);
1260   out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1261   for (out_len = 0; out_len < in.length; out_len += mblen)
1262     {
1263       if (in.string[out_len] == '\n'
1264           || (in.string[out_len] == '\r'
1265               && out_len + 1 < in.length
1266               && in.string[out_len + 1] == '\n'))
1267         break;
1268
1269       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1270                         in.length - out_len);
1271       if (out_len + mblen > out_maxlen)
1272         break;
1273     }
1274
1275   memcpy (out, in.string, out_len);
1276   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1277 }
1278
1279 static void
1280 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1281                          const char *format, va_list args)
1282 {
1283   const struct lex_token *token;
1284   struct string s;
1285   struct msg m;
1286
1287   ds_init_empty (&s);
1288
1289   token = lex_source_next__ (src, n0);
1290   if (token->token.type == T_ENDCMD)
1291     ds_put_cstr (&s, _("Syntax error at end of command"));
1292   else
1293     {
1294       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1295       if (!ss_is_empty (syntax))
1296         {
1297           char syntax_cstr[64];
1298
1299           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1300           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1301         }
1302       else
1303         ds_put_cstr (&s, _("Syntax error"));
1304     }
1305
1306   if (format)
1307     {
1308       ds_put_cstr (&s, ": ");
1309       ds_put_vformat (&s, format, args);
1310     }
1311   ds_put_byte (&s, '.');
1312
1313   m.category = MSG_C_SYNTAX;
1314   m.severity = MSG_S_ERROR;
1315   m.file_name = src->reader->file_name;
1316   m.first_line = lex_source_get_first_line_number (src, n0);
1317   m.last_line = lex_source_get_last_line_number (src, n1);
1318   m.first_column = lex_source_get_first_column (src, n0);
1319   m.last_column = lex_source_get_last_column (src, n1);
1320   m.text = ds_steal_cstr (&s);
1321   msg_emit (&m);
1322 }
1323
1324 static void PRINTF_FORMAT (2, 3)
1325 lex_get_error (struct lex_source *src, const char *format, ...)
1326 {
1327   va_list args;
1328   int n;
1329
1330   va_start (args, format);
1331
1332   n = deque_count (&src->deque) - 1;
1333   lex_source_error_valist (src, n, n, format, args);
1334   lex_source_pop_front (src);
1335
1336   va_end (args);
1337 }
1338
1339 /* Attempts to append an additional token into SRC's deque, reading more from
1340    the underlying lex_reader if necessary..  Returns true if successful, false
1341    if the deque already represents (a suffix of) the whole lex_reader's
1342    contents, */
1343 static bool
1344 lex_source_get__ (const struct lex_source *src_)
1345 {
1346   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1347   if (src->eof)
1348     return false;
1349
1350   /* State maintained while scanning tokens.  Usually we only need a single
1351      state, but scanner_push() can return SCAN_SAVE to indicate that the state
1352      needs to be saved and possibly restored later with SCAN_BACK. */
1353   struct state
1354     {
1355       struct segmenter segmenter;
1356       enum segment_type last_segment;
1357       int newlines;             /* Number of newlines encountered so far. */
1358       /* Maintained here so we can update lex_source's similar members when we
1359          finish. */
1360       size_t line_pos;
1361       size_t seg_pos;
1362     };
1363
1364   /* Initialize state. */
1365   struct state state =
1366     {
1367       .segmenter = src->segmenter,
1368       .newlines = 0,
1369       .seg_pos = src->seg_pos,
1370       .line_pos = src->line_pos,
1371     };
1372   struct state saved = state;
1373
1374   /* Append a new token to SRC and initialize it. */
1375   struct lex_token *token = lex_push_token__ (src);
1376   struct scanner scanner;
1377   scanner_init (&scanner, &token->token);
1378   token->line_pos = src->line_pos;
1379   token->token_pos = src->seg_pos;
1380   if (src->reader->line_number > 0)
1381     token->first_line = src->reader->line_number + src->n_newlines;
1382   else
1383     token->first_line = 0;
1384
1385   /* Extract segments and pass them through the scanner until we obtain a
1386      token. */
1387   for (;;)
1388     {
1389       /* Extract a segment. */
1390       const char *segment = &src->buffer[state.seg_pos - src->tail];
1391       size_t seg_maxlen = src->head - state.seg_pos;
1392       enum segment_type type;
1393       int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1394                                     &type);
1395       if (seg_len < 0)
1396         {
1397           /* The segmenter needs more input to produce a segment. */
1398           lex_source_read__ (src);
1399           continue;
1400         }
1401
1402       /* Update state based on the segment. */
1403       state.last_segment = type;
1404       state.seg_pos += seg_len;
1405       if (type == SEG_NEWLINE)
1406         {
1407           state.newlines++;
1408           state.line_pos = state.seg_pos;
1409         }
1410
1411       /* Pass the segment into the scanner and try to get a token out. */
1412       enum scan_result result = scanner_push (&scanner, type,
1413                                               ss_buffer (segment, seg_len),
1414                                               &token->token);
1415       if (result == SCAN_SAVE)
1416         saved = state;
1417       else if (result == SCAN_BACK)
1418         {
1419           state = saved;
1420           break;
1421         }
1422       else if (result == SCAN_DONE)
1423         break;
1424     }
1425
1426   /* If we've reached the end of a line, or the end of a command, then pass
1427      the line to the output engine as a syntax text item.  */
1428   int n_lines = state.newlines;
1429   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1430     {
1431       n_lines++;
1432       src->suppress_next_newline = true;
1433     }
1434   else if (n_lines > 0 && src->suppress_next_newline)
1435     {
1436       n_lines--;
1437       src->suppress_next_newline = false;
1438     }
1439   for (int i = 0; i < n_lines; i++)
1440     {
1441       const char *line = &src->buffer[src->journal_pos - src->tail];
1442       const char *newline = rawmemchr (line, '\n');
1443       size_t line_len = newline - line;
1444       if (line_len > 0 && line[line_len - 1] == '\r')
1445         line_len--;
1446
1447       char *syntax = malloc (line_len + 2);
1448       memcpy (syntax, line, line_len);
1449       syntax[line_len] = '\n';
1450       syntax[line_len + 1] = '\0';
1451
1452       text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1453
1454       src->journal_pos += newline - line + 1;
1455     }
1456
1457   token->token_len = state.seg_pos - src->seg_pos;
1458
1459   src->segmenter = state.segmenter;
1460   src->seg_pos = state.seg_pos;
1461   src->line_pos = state.line_pos;
1462   src->n_newlines += state.newlines;
1463
1464   switch (token->token.type)
1465     {
1466     default:
1467       break;
1468
1469     case T_STOP:
1470       token->token.type = T_ENDCMD;
1471       src->eof = true;
1472       break;
1473
1474     case SCAN_BAD_HEX_LENGTH:
1475       lex_get_error (src, _("String of hex digits has %d characters, which "
1476                             "is not a multiple of 2"),
1477                      (int) token->token.number);
1478       break;
1479
1480     case SCAN_BAD_HEX_DIGIT:
1481     case SCAN_BAD_UNICODE_DIGIT:
1482       lex_get_error (src, _("`%c' is not a valid hex digit"),
1483                      (int) token->token.number);
1484       break;
1485
1486     case SCAN_BAD_UNICODE_LENGTH:
1487       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1488                             "not in the valid range of 1 to 8 bytes"),
1489                      (int) token->token.number);
1490       break;
1491
1492     case SCAN_BAD_UNICODE_CODE_POINT:
1493       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1494                      (int) token->token.number);
1495       break;
1496
1497     case SCAN_EXPECTED_QUOTE:
1498       lex_get_error (src, _("Unterminated string constant"));
1499       break;
1500
1501     case SCAN_EXPECTED_EXPONENT:
1502       lex_get_error (src, _("Missing exponent following `%s'"),
1503                      token->token.string.string);
1504       break;
1505
1506     case SCAN_UNEXPECTED_DOT:
1507       lex_get_error (src, _("Unexpected `.' in middle of command"));
1508       break;
1509
1510     case SCAN_UNEXPECTED_CHAR:
1511       {
1512         char c_name[16];
1513         lex_get_error (src, _("Bad character %s in input"),
1514                        uc_name (token->token.number, c_name));
1515       }
1516       break;
1517
1518     case SCAN_SKIP:
1519       lex_source_pop_front (src);
1520       break;
1521     }
1522
1523   return true;
1524 }
1525 \f
1526 static void
1527 lex_source_push_endcmd__ (struct lex_source *src)
1528 {
1529   struct lex_token *token = lex_push_token__ (src);
1530   token->token.type = T_ENDCMD;
1531   token->token_pos = 0;
1532   token->token_len = 0;
1533   token->line_pos = 0;
1534   token->first_line = 0;
1535 }
1536
1537 static struct lex_source *
1538 lex_source_create (struct lex_reader *reader)
1539 {
1540   struct lex_source *src;
1541   enum segmenter_mode mode;
1542
1543   src = xzalloc (sizeof *src);
1544   src->reader = reader;
1545
1546   if (reader->syntax == LEX_SYNTAX_AUTO)
1547     mode = SEG_MODE_AUTO;
1548   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1549     mode = SEG_MODE_INTERACTIVE;
1550   else if (reader->syntax == LEX_SYNTAX_BATCH)
1551     mode = SEG_MODE_BATCH;
1552   else
1553     NOT_REACHED ();
1554   segmenter_init (&src->segmenter, mode);
1555
1556   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1557
1558   lex_source_push_endcmd__ (src);
1559
1560   return src;
1561 }
1562
1563 static void
1564 lex_source_destroy (struct lex_source *src)
1565 {
1566   char *file_name = src->reader->file_name;
1567   char *encoding = src->reader->encoding;
1568   if (src->reader->class->destroy != NULL)
1569     src->reader->class->destroy (src->reader);
1570   free (file_name);
1571   free (encoding);
1572   free (src->buffer);
1573   while (!deque_is_empty (&src->deque))
1574     lex_source_pop__ (src);
1575   free (src->tokens);
1576   ll_remove (&src->ll);
1577   free (src);
1578 }
1579 \f
/* A lex_reader that obtains its input from a u8_istream, as set up by
   lex_reader_for_file(). */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Embedded generic reader. */
    struct u8_istream *istream; /* Stream that lex_file_read() reads. */
  };

/* Class for lex_file_reader.  Declared here and initialized further down,
   after the functions that it points to. */
static struct lex_reader_class lex_file_reader_class;
1587
1588 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1589    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1590    ENCODING, which should take one of the forms accepted by
1591    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1592    mode of the new reader, respectively.
1593
1594    Returns a null pointer if FILE_NAME cannot be opened. */
1595 struct lex_reader *
1596 lex_reader_for_file (const char *file_name, const char *encoding,
1597                      enum lex_syntax_mode syntax,
1598                      enum lex_error_mode error)
1599 {
1600   struct lex_file_reader *r;
1601   struct u8_istream *istream;
1602
1603   istream = (!strcmp(file_name, "-")
1604              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1605              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1606   if (istream == NULL)
1607     {
1608       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1609       return NULL;
1610     }
1611
1612   r = xmalloc (sizeof *r);
1613   lex_reader_init (&r->reader, &lex_file_reader_class);
1614   r->reader.syntax = syntax;
1615   r->reader.error = error;
1616   r->reader.file_name = xstrdup (file_name);
1617   r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1618   r->reader.line_number = 1;
1619   r->istream = istream;
1620
1621   return &r->reader;
1622 }
1623
1624 static struct lex_file_reader *
1625 lex_file_reader_cast (struct lex_reader *r)
1626 {
1627   return UP_CAST (r, struct lex_file_reader, reader);
1628 }
1629
1630 static size_t
1631 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1632                enum prompt_style prompt_style UNUSED)
1633 {
1634   struct lex_file_reader *r = lex_file_reader_cast (r_);
1635   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1636   if (n_read < 0)
1637     {
1638       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1639       return 0;
1640     }
1641   return n_read;
1642 }
1643
1644 static void
1645 lex_file_close (struct lex_reader *r_)
1646 {
1647   struct lex_file_reader *r = lex_file_reader_cast (r_);
1648
1649   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1650     {
1651       if (u8_istream_close (r->istream) != 0)
1652         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1653     }
1654   else
1655     u8_istream_free (r->istream);
1656
1657   free (r);
1658 }
1659
1660 static struct lex_reader_class lex_file_reader_class =
1661   {
1662     lex_file_read,
1663     lex_file_close
1664   };
1665 \f
/* A lex_reader that obtains its input from a string in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded generic reader. */
    struct substring s;         /* The text to read; freed by
                                   lex_string_close(). */
    size_t offset;              /* Number of bytes of S already handed out by
                                   lex_string_read(). */
  };

/* Class for lex_string_reader.  Declared here and initialized further down,
   after the functions that it points to. */
static struct lex_reader_class lex_string_reader_class;
1674
1675 /* Creates and returns a new lex_reader for the contents of S, which must be
1676    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
1677    with ss_dealloc() when it is closed. */
1678 struct lex_reader *
1679 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1680 {
1681   struct lex_string_reader *r;
1682
1683   r = xmalloc (sizeof *r);
1684   lex_reader_init (&r->reader, &lex_string_reader_class);
1685   r->reader.syntax = LEX_SYNTAX_AUTO;
1686   r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1687   r->s = s;
1688   r->offset = 0;
1689
1690   return &r->reader;
1691 }
1692
1693 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1694    which must be encoded in ENCODING.  The caller retains ownership of S. */
1695 struct lex_reader *
1696 lex_reader_for_string (const char *s, const char *encoding)
1697 {
1698   struct substring ss;
1699   ss_alloc_substring (&ss, ss_cstr (s));
1700   return lex_reader_for_substring_nocopy (ss, encoding);
1701 }
1702
/* Expands FORMAT as a printf()-like format string and creates and returns a
   new lex_reader, with the given ENCODING, for the formatted result.

   Note that the arguments for FORMAT come after ENCODING, not directly after
   FORMAT. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1717
1718 static struct lex_string_reader *
1719 lex_string_reader_cast (struct lex_reader *r)
1720 {
1721   return UP_CAST (r, struct lex_string_reader, reader);
1722 }
1723
1724 static size_t
1725 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1726                  enum prompt_style prompt_style UNUSED)
1727 {
1728   struct lex_string_reader *r = lex_string_reader_cast (r_);
1729   size_t chunk;
1730
1731   chunk = MIN (n, r->s.length - r->offset);
1732   memcpy (buf, r->s.string + r->offset, chunk);
1733   r->offset += chunk;
1734
1735   return chunk;
1736 }
1737
1738 static void
1739 lex_string_close (struct lex_reader *r_)
1740 {
1741   struct lex_string_reader *r = lex_string_reader_cast (r_);
1742
1743   ss_dealloc (&r->s);
1744   free (r);
1745 }
1746
1747 static struct lex_reader_class lex_string_reader_class =
1748   {
1749     lex_string_read,
1750     lex_string_close
1751   };