src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "data/file-name.h"
  34 #include "language/command.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/text-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Besides the token proper, this records where the token's source text lies
   within the owning lex_source's buffer, which is what the line and column
   number functions below use to locate the token. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos (0 is treated as
                                   "no line number" by the accessors). */
  };
  72
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes.

       Positions stored elsewhere (e.g. in struct lex_token) are offsets into
       the source as a whole; subtract 'tail' to obtain an index into
       'buffer'. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline; /* NOTE(review): its use is not visible in this
                                   part of the file. */

    /* Tokens.

       New tokens are pushed at the front of 'deque' and the current (oldest)
       token is popped from the back. */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 102
/* Forward declarations: lex_include() and lex_append() create sources, and
   lex_destroy() and lex_get() destroy them, before the definitions appear. */
static struct lex_source *lex_source_create (struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 105
/* Lexer.

   A lexer is a list of sources.  lex_include() adds a source at the head of
   the list and lex_append() adds one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
  };
 111
/* Forward declarations of file-local helpers. */
static struct lex_source *lex_source__ (const struct lexer *);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static void lex_source_pop__ (struct lex_source *);
static bool lex_source_get__ (const struct lex_source *);
/* FORMAT is the 4th parameter; the arguments arrive as a va_list. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 123 \f
 124 /* Initializes READER with the specified CLASS and otherwise some reasonable
 125    defaults.  The caller should fill in the others members as desired. */
 126 void
 127 lex_reader_init (struct lex_reader *reader,
 128                  const struct lex_reader_class *class)
 129 {
 130   reader->class = class;
 131   reader->syntax = LEX_SYNTAX_AUTO;
 132   reader->error = LEX_ERROR_INTERACTIVE;
 133   reader->file_name = NULL;
 134   reader->line_number = 0;
 135 }
 136
 137 /* Frees any file name already in READER and replaces it by a copy of
 138    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 139 void
 140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 141 {
 142   free (reader->file_name);
 143   reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
 144 }
 145 \f
 146 /* Creates and returns a new lexer. */
 147 struct lexer *
 148 lex_create (void)
 149 {
 150   struct lexer *lexer = xzalloc (sizeof *lexer);
 151   ll_init (&lexer->sources);
 152   return lexer;
 153 }
 154
 155 /* Destroys LEXER. */
 156 void
 157 lex_destroy (struct lexer *lexer)
 158 {
 159   if (lexer != NULL)
 160     {
 161       struct lex_source *source, *next;
 162
 163       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 164         lex_source_destroy (source);
 165       free (lexer);
 166     }
 167 }
 168
 169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 170    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 171    token. */
 172 void
 173 lex_include (struct lexer *lexer, struct lex_reader *reader)
 174 {
 175   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 176   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 177 }
 178
 179 /* Appends READER to LEXER, so that it will be read after all other current
 180    readers have already been read. */
 181 void
 182 lex_append (struct lexer *lexer, struct lex_reader *reader)
 183 {
 184   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 185 }
 186 \f
/* Advancing. */
 188
 189 static struct lex_token *
 190 lex_push_token__ (struct lex_source *src)
 191 {
 192   struct lex_token *token;
 193
 194   if (deque_is_full (&src->deque))
 195     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 196
 197   token = &src->tokens[deque_push_front (&src->deque)];
 198   token_init (&token->token);
 199   return token;
 200 }
 201
 202 static void
 203 lex_source_pop__ (struct lex_source *src)
 204 {
 205   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 206 }
 207
 208 static void
 209 lex_source_pop_front (struct lex_source *src)
 210 {
 211   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 212 }
 213
 214 /* Advances LEXER to the next token, consuming the current token. */
 215 void
 216 lex_get (struct lexer *lexer)
 217 {
 218   struct lex_source *src;
 219
 220   src = lex_source__ (lexer);
 221   if (src == NULL)
 222     return;
 223
 224   if (!deque_is_empty (&src->deque))
 225     lex_source_pop__ (src);
 226
 227   while (deque_is_empty (&src->deque))
 228     if (!lex_source_get__ (src))
 229       {
 230         lex_source_destroy (src);
 231         src = lex_source__ (lexer);
 232         if (src == NULL)
 233           return;
 234       }
 235 }
 236 \f
 237 /* Issuing errors. */
 238
 239 /* Prints a syntax error message containing the current token and
 240    given message MESSAGE (if non-null). */
 241 void
 242 lex_error (struct lexer *lexer, const char *format, ...)
 243 {
 244   va_list args;
 245
 246   va_start (args, format);
 247   lex_next_error_valist (lexer, 0, 0, format, args);
 248   va_end (args);
 249 }
 250
 251 /* Prints a syntax error message containing the current token and
 252    given message MESSAGE (if non-null). */
 253 void
 254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 255 {
 256   lex_next_error_valist (lexer, 0, 0, format, args);
 257 }
 258
 259 /* Prints a syntax error message containing the current token and
 260    given message MESSAGE (if non-null). */
 261 void
 262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 263 {
 264   va_list args;
 265
 266   va_start (args, format);
 267   lex_next_error_valist (lexer, n0, n1, format, args);
 268   va_end (args);
 269 }
 270
 271 /* Prints a syntax error message saying that OPTION0 or one of the other
 272    strings following it, up to the first NULL, is expected. */
 273 void
 274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
 275 {
 276   enum { MAX_OPTIONS = 8 };
 277   const char *options[MAX_OPTIONS + 1];
 278   va_list args;
 279   int n;
 280
 281   va_start (args, option0);
 282   options[0] = option0;
 283   n = 0;
 284   while (n + 1 < MAX_OPTIONS && options[n] != NULL)
 285     options[++n] = va_arg (args, const char *);
 286   va_end (args);
 287
 288   switch (n)
 289     {
 290     case 0:
 291       lex_error (lexer, NULL);
 292       break;
 293
 294     case 1:
 295       lex_error (lexer, _("expecting %s"), options[0]);
 296       break;
 297
 298     case 2:
 299       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 300       break;
 301
 302     case 3:
 303       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 304                  options[2]);
 305       break;
 306
 307     case 4:
 308       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 309                  options[0], options[1], options[2], options[3]);
 310       break;
 311
 312     case 5:
 313       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 314                  options[0], options[1], options[2], options[3], options[4]);
 315       break;
 316
 317     case 6:
 318       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 319                  options[0], options[1], options[2], options[3], options[4],
 320                  options[5]);
 321       break;
 322
 323     case 7:
 324       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 325                  options[0], options[1], options[2], options[3], options[4],
 326                  options[5], options[6]);
 327       break;
 328
 329     case 8:
 330       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 331                  options[0], options[1], options[2], options[3], options[4],
 332                  options[5], options[6], options[7]);
 333       break;
 334
 335     default:
 336       NOT_REACHED ();
 337     }
 338 }
 339
 340 /* Reports an error to the effect that subcommand SBC may only be specified
 341    once.
 342
 343    This function does not take a lexer as an argument or use lex_error(),
 344    because the result would ordinarily just be redundant: "Syntax error at
 345    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 346    not help the user find the error. */
 347 void
 348 lex_sbc_only_once (const char *sbc)
 349 {
 350   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 351 }
 352
 353 /* Reports an error to the effect that subcommand SBC is missing.
 354
 355    This function does not take a lexer as an argument or use lex_error(),
 356    because a missing subcommand can normally be detected only after the whole
 357    command has been parsed, and so lex_error() would always report "Syntax
 358    error at end of command", which does not help the user find the error. */
 359 void
 360 lex_sbc_missing (const char *sbc)
 361 {
 362   msg (SE, _("Required subcommand %s was not specified."), sbc);
 363 }
 364
 365 /* Reports an error to the effect that specification SPEC may only be specified
 366    once within subcommand SBC. */
 367 void
 368 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 369 {
 370   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 371              spec, sbc);
 372 }
 373
 374 /* Reports an error to the effect that specification SPEC is missing within
 375    subcommand SBC. */
 376 void
 377 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 378 {
 379   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 380              sbc, spec);
 381 }
 382
 383 /* Prints a syntax error message containing the current token and
 384    given message MESSAGE (if non-null). */
 385 void
 386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 387                        const char *format, va_list args)
 388 {
 389   struct lex_source *src = lex_source__ (lexer);
 390
 391   if (src != NULL)
 392     lex_source_error_valist (src, n0, n1, format, args);
 393   else
 394     {
 395       struct string s;
 396
 397       ds_init_empty (&s);
 398       ds_put_format (&s, _("Syntax error at end of input"));
 399       if (format != NULL)
 400         {
 401           ds_put_cstr (&s, ": ");
 402           ds_put_vformat (&s, format, args);
 403         }
 404       ds_put_byte (&s, '.');
 405       msg (SE, "%s", ds_cstr (&s));
 406       ds_destroy (&s);
 407     }
 408 }
 409
 410 /* Checks that we're at end of command.
 411    If so, returns a successful command completion code.
 412    If not, flags a syntax error and returns an error command
 413    completion code. */
 414 int
 415 lex_end_of_command (struct lexer *lexer)
 416 {
 417   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 418     {
 419       lex_error (lexer, _("expecting end of command"));
 420       return CMD_FAILURE;
 421     }
 422   else
 423     return CMD_SUCCESS;
 424 }
 425 \f
 426 /* Token testing functions. */
 427
 428 /* Returns true if the current token is a number. */
 429 bool
 430 lex_is_number (struct lexer *lexer)
 431 {
 432   return lex_next_is_number (lexer, 0);
 433 }
 434
 435 /* Returns true if the current token is a string. */
 436 bool
 437 lex_is_string (struct lexer *lexer)
 438 {
 439   return lex_next_is_string (lexer, 0);
 440 }
 441
 442 /* Returns the value of the current token, which must be a
 443    floating point number. */
 444 double
 445 lex_number (struct lexer *lexer)
 446 {
 447   return lex_next_number (lexer, 0);
 448 }
 449
 450 /* Returns true iff the current token is an integer. */
 451 bool
 452 lex_is_integer (struct lexer *lexer)
 453 {
 454   return lex_next_is_integer (lexer, 0);
 455 }
 456
 457 /* Returns the value of the current token, which must be an
 458    integer. */
 459 long
 460 lex_integer (struct lexer *lexer)
 461 {
 462   return lex_next_integer (lexer, 0);
 463 }
 464 \f
 465 /* Token testing functions with lookahead.
 466
 467    A value of 0 for N as an argument to any of these functions refers to the
 468    current token.  Lookahead is limited to the current command.  Any N greater
 469    than the number of tokens remaining in the current command will be treated
 470    as referring to a T_ENDCMD token. */
 471
 472 /* Returns true if the token N ahead of the current token is a number. */
 473 bool
 474 lex_next_is_number (struct lexer *lexer, int n)
 475 {
 476   enum token_type next_token = lex_next_token (lexer, n);
 477   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 478 }
 479
 480 /* Returns true if the token N ahead of the current token is a string. */
 481 bool
 482 lex_next_is_string (struct lexer *lexer, int n)
 483 {
 484   return lex_next_token (lexer, n) == T_STRING;
 485 }
 486
 487 /* Returns the value of the token N ahead of the current token, which must be a
 488    floating point number. */
 489 double
 490 lex_next_number (struct lexer *lexer, int n)
 491 {
 492   assert (lex_next_is_number (lexer, n));
 493   return lex_next_tokval (lexer, n);
 494 }
 495
 496 /* Returns true if the token N ahead of the current token is an integer. */
 497 bool
 498 lex_next_is_integer (struct lexer *lexer, int n)
 499 {
 500   double value;
 501
 502   if (!lex_next_is_number (lexer, n))
 503     return false;
 504
 505   value = lex_next_tokval (lexer, n);
 506   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 507 }
 508
 509 /* Returns the value of the token N ahead of the current token, which must be
 510    an integer. */
 511 long
 512 lex_next_integer (struct lexer *lexer, int n)
 513 {
 514   assert (lex_next_is_integer (lexer, n));
 515   return lex_next_tokval (lexer, n);
 516 }
 517 \f
 518 /* Token matching functions. */
 519
 520 /* If the current token has the specified TYPE, skips it and returns true.
 521    Otherwise, returns false. */
 522 bool
 523 lex_match (struct lexer *lexer, enum token_type type)
 524 {
 525   if (lex_token (lexer) == type)
 526     {
 527       lex_get (lexer);
 528       return true;
 529     }
 530   else
 531     return false;
 532 }
 533
 534 /* If the current token matches IDENTIFIER, skips it and returns true.
 535    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 536    returns false.
 537
 538    IDENTIFIER must be an ASCII string. */
 539 bool
 540 lex_match_id (struct lexer *lexer, const char *identifier)
 541 {
 542   return lex_match_id_n (lexer, identifier, 3);
 543 }
 544
 545 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 546    may be abbreviated to its first N letters.  Otherwise, returns false.
 547
 548    IDENTIFIER must be an ASCII string. */
 549 bool
 550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 551 {
 552   if (lex_token (lexer) == T_ID
 553       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 554     {
 555       lex_get (lexer);
 556       return true;
 557     }
 558   else
 559     return false;
 560 }
 561
 562 /* If the current token is integer X, skips it and returns true.  Otherwise,
 563    returns false. */
 564 bool
 565 lex_match_int (struct lexer *lexer, int x)
 566 {
 567   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 568     {
 569       lex_get (lexer);
 570       return true;
 571     }
 572   else
 573     return false;
 574 }
 575 \f
 576 /* Forced matches. */
 577
 578 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 579    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 580    false.
 581
 582    IDENTIFIER must be an ASCII string. */
 583 bool
 584 lex_force_match_id (struct lexer *lexer, const char *identifier)
 585 {
 586   if (lex_match_id (lexer, identifier))
 587     return true;
 588   else
 589     {
 590       lex_error_expecting (lexer, identifier, NULL_SENTINEL);
 591       return false;
 592     }
 593 }
 594
 595 /* If the current token has the specified TYPE, skips it and returns true.
 596    Otherwise, reports an error and returns false. */
 597 bool
 598 lex_force_match (struct lexer *lexer, enum token_type type)
 599 {
 600   if (lex_token (lexer) == type)
 601     {
 602       lex_get (lexer);
 603       return true;
 604     }
 605   else
 606     {
 607       char *s = xasprintf ("`%s'", token_type_to_string (type));
 608       lex_error_expecting (lexer, s, NULL_SENTINEL);
 609       free (s);
 610       return false;
 611     }
 612 }
 613
 614 /* If the current token is a string, does nothing and returns true.
 615    Otherwise, reports an error and returns false. */
 616 bool
 617 lex_force_string (struct lexer *lexer)
 618 {
 619   if (lex_is_string (lexer))
 620     return true;
 621   else
 622     {
 623       lex_error (lexer, _("expecting string"));
 624       return false;
 625     }
 626 }
 627
 628 /* If the current token is an integer, does nothing and returns true.
 629    Otherwise, reports an error and returns false. */
 630 bool
 631 lex_force_int (struct lexer *lexer)
 632 {
 633   if (lex_is_integer (lexer))
 634     return true;
 635   else
 636     {
 637       lex_error (lexer, _("expecting integer"));
 638       return false;
 639     }
 640 }
 641
 642 /* If the current token is a number, does nothing and returns true.
 643    Otherwise, reports an error and returns false. */
 644 bool
 645 lex_force_num (struct lexer *lexer)
 646 {
 647   if (lex_is_number (lexer))
 648     return true;
 649
 650   lex_error (lexer, _("expecting number"));
 651   return false;
 652 }
 653
 654 /* If the current token is an identifier, does nothing and returns true.
 655    Otherwise, reports an error and returns false. */
 656 bool
 657 lex_force_id (struct lexer *lexer)
 658 {
 659   if (lex_token (lexer) == T_ID)
 660     return true;
 661
 662   lex_error (lexer, _("expecting identifier"));
 663   return false;
 664 }
 665 \f
 666 /* Token accessors. */
 667
 668 /* Returns the type of LEXER's current token. */
 669 enum token_type
 670 lex_token (const struct lexer *lexer)
 671 {
 672   return lex_next_token (lexer, 0);
 673 }
 674
 675 /* Returns the number in LEXER's current token.
 676
 677    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 678    tokens this function will always return zero. */
 679 double
 680 lex_tokval (const struct lexer *lexer)
 681 {
 682   return lex_next_tokval (lexer, 0);
 683 }
 684
 685 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 686
 687    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 688    this functions this function will always return NULL.
 689
 690    The UTF-8 encoding of the returned string is correct for variable names and
 691    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 692    data_in() to use it in a "union value".  */
 693 const char *
 694 lex_tokcstr (const struct lexer *lexer)
 695 {
 696   return lex_next_tokcstr (lexer, 0);
 697 }
 698
 699 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 700    null-terminated (but the null terminator is not included in the returned
 701    substring's 'length').
 702
 703    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 704    this functions this function will always return NULL.
 705
 706    The UTF-8 encoding of the returned string is correct for variable names and
 707    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 708    data_in() to use it in a "union value".  */
 709 struct substring
 710 lex_tokss (const struct lexer *lexer)
 711 {
 712   return lex_next_tokss (lexer, 0);
 713 }
 714 \f
 715 /* Looking ahead.
 716
 717    A value of 0 for N as an argument to any of these functions refers to the
 718    current token.  Lookahead is limited to the current command.  Any N greater
 719    than the number of tokens remaining in the current command will be treated
 720    as referring to a T_ENDCMD token. */
 721
 722 static const struct lex_token *
 723 lex_next__ (const struct lexer *lexer_, int n)
 724 {
 725   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 726   struct lex_source *src = lex_source__ (lexer);
 727
 728   if (src != NULL)
 729     return lex_source_next__ (src, n);
 730   else
 731     {
 732       static const struct lex_token stop_token =
 733         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 734
 735       return &stop_token;
 736     }
 737 }
 738
 739 static const struct lex_token *
 740 lex_source_next__ (const struct lex_source *src, int n)
 741 {
 742   while (deque_count (&src->deque) <= n)
 743     {
 744       if (!deque_is_empty (&src->deque))
 745         {
 746           struct lex_token *front;
 747
 748           front = &src->tokens[deque_front (&src->deque, 0)];
 749           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 750             return front;
 751         }
 752
 753       lex_source_get__ (src);
 754     }
 755
 756   return &src->tokens[deque_back (&src->deque, n)];
 757 }
 758
 759 /* Returns the "struct token" of the token N after the current one in LEXER.
 760    The returned pointer can be invalidated by pretty much any succeeding call
 761    into the lexer, although the string pointer within the returned token is
 762    only invalidated by consuming the token (e.g. with lex_get()). */
 763 const struct token *
 764 lex_next (const struct lexer *lexer, int n)
 765 {
 766   return &lex_next__ (lexer, n)->token;
 767 }
 768
 769 /* Returns the type of the token N after the current one in LEXER. */
 770 enum token_type
 771 lex_next_token (const struct lexer *lexer, int n)
 772 {
 773   return lex_next (lexer, n)->type;
 774 }
 775
 776 /* Returns the number in the tokn N after the current one in LEXER.
 777
 778    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 779    tokens this function will always return zero. */
 780 double
 781 lex_next_tokval (const struct lexer *lexer, int n)
 782 {
 783   const struct token *token = lex_next (lexer, n);
 784   return token->number;
 785 }
 786
 787 /* Returns the null-terminated string in the token N after the current one, in
 788    UTF-8 encoding.
 789
 790    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 791    this functions this function will always return NULL.
 792
 793    The UTF-8 encoding of the returned string is correct for variable names and
 794    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 795    data_in() to use it in a "union value".  */
 796 const char *
 797 lex_next_tokcstr (const struct lexer *lexer, int n)
 798 {
 799   return lex_next_tokss (lexer, n).string;
 800 }
 801
 802 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 803    The string is null-terminated (but the null terminator is not included in
 804    the returned substring's 'length').
 805
 806    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 807    this functions this function will always return NULL.
 808
 809    The UTF-8 encoding of the returned string is correct for variable names and
 810    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 811    data_in() to use it in a "union value".  */
 812 struct substring
 813 lex_next_tokss (const struct lexer *lexer, int n)
 814 {
 815   return lex_next (lexer, n)->string;
 816 }
 817
 818 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
 819    true.  Otherwise, returns false.
 820
 821    S may consist of an arbitrary number of identifiers, integers, and
 822    punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
 823    Identifiers may be abbreviated to their first three letters.  Currently only
 824    hyphens, slashes, and equals signs are supported as punctuation (but it
 825    would be easy to add more).
 826
 827    S must be an ASCII string. */
 828 bool
 829 lex_match_phrase (struct lexer *lexer, const char *s)
 830 {
 831   int tok_idx;
 832
 833   for (tok_idx = 0; ; tok_idx++)
 834     {
 835       enum token_type token;
 836       unsigned char c;
 837
 838       while (c_isspace (*s))
 839         s++;
 840
 841       c = *s;
 842       if (c == '\0')
 843         {
 844           int i;
 845
 846           for (i = 0; i < tok_idx; i++)
 847             lex_get (lexer);
 848           return true;
 849         }
 850
 851       token = lex_next_token (lexer, tok_idx);
 852       switch (c)
 853         {
 854         case '-':
 855           if (token != T_DASH)
 856             return false;
 857           s++;
 858           break;
 859
 860         case '/':
 861           if (token != T_SLASH)
 862             return false;
 863           s++;
 864           break;
 865
 866         case '=':
 867           if (token != T_EQUALS)
 868             return false;
 869           s++;
 870           break;
 871
 872         case '0': case '1': case '2': case '3': case '4':
 873         case '5': case '6': case '7': case '8': case '9':
 874           {
 875             unsigned int value;
 876
 877             if (token != T_POS_NUM)
 878               return false;
 879
 880             value = 0;
 881             do
 882               {
 883                 value = value * 10 + (*s++ - '0');
 884               }
 885             while (c_isdigit (*s));
 886
 887             if (lex_next_tokval (lexer, tok_idx) != value)
 888               return false;
 889           }
 890           break;
 891
 892         default:
 893           if (lex_is_id1 (c))
 894             {
 895               int len;
 896
 897               if (token != T_ID)
 898                 return false;
 899
 900               len = lex_id_get_length (ss_cstr (s));
 901               if (!lex_id_match (ss_buffer (s, len),
 902                                  lex_next_tokss (lexer, tok_idx)))
 903                 return false;
 904
 905               s += len;
 906             }
 907           else
 908             NOT_REACHED ();
 909         }
 910     }
 911 }
 912
 913 static int
 914 lex_source_get_first_line_number (const struct lex_source *src, int n)
 915 {
 916   return lex_source_next__ (src, n)->first_line;
 917 }
 918
 919 static int
 920 count_newlines (char *s, size_t length)
 921 {
 922   int n_newlines = 0;
 923   char *newline;
 924
 925   while ((newline = memchr (s, '\n', length)) != NULL)
 926     {
 927       n_newlines++;
 928       length -= (newline + 1) - s;
 929       s = newline + 1;
 930     }
 931
 932   return n_newlines;
 933 }
 934
 935 static int
 936 lex_source_get_last_line_number (const struct lex_source *src, int n)
 937 {
 938   const struct lex_token *token = lex_source_next__ (src, n);
 939
 940   if (token->first_line == 0)
 941     return 0;
 942   else
 943     {
 944       char *token_str = &src->buffer[token->token_pos - src->tail];
 945       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 946     }
 947 }
 948
 949 static int
 950 count_columns (const char *s_, size_t length)
 951 {
 952   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 953   int columns;
 954   size_t ofs;
 955   int mblen;
 956
 957   columns = 0;
 958   for (ofs = 0; ofs < length; ofs += mblen)
 959     {
 960       ucs4_t uc;
 961
 962       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 963       if (uc != '\t')
 964         {
 965           int width = uc_width (uc, "UTF-8");
 966           if (width > 0)
 967             columns += width;
 968         }
 969       else
 970         columns = ROUND_UP (columns + 1, 8);
 971     }
 972
 973   return columns + 1;
 974 }
 975
 976 static int
 977 lex_source_get_first_column (const struct lex_source *src, int n)
 978 {
 979   const struct lex_token *token = lex_source_next__ (src, n);
 980   return count_columns (&src->buffer[token->line_pos - src->tail],
 981                         token->token_pos - token->line_pos);
 982 }
 983
 984 static int
 985 lex_source_get_last_column (const struct lex_source *src, int n)
 986 {
 987   const struct lex_token *token = lex_source_next__ (src, n);
 988   char *start, *end, *newline;
 989
 990   start = &src->buffer[token->line_pos - src->tail];
 991   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 992   newline = memrchr (start, '\n', end - start);
 993   if (newline != NULL)
 994     start = newline + 1;
 995   return count_columns (start, end - start);
 996 }
 997
 998 /* Returns the 1-based line number of the start of the syntax that represents
 999    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1000    if the token is drawn from a source that does not have line numbers. */
1001 int
1002 lex_get_first_line_number (const struct lexer *lexer, int n)
1003 {
1004   const struct lex_source *src = lex_source__ (lexer);
1005   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1006 }
1007
/* Reports the 1-based line number on which the syntax for the token N after
   the current one in LEXER ends, plus 1.  The result is 0 if LEXER has no
   current source, for a T_STOP token, or if the token comes from a source that
   does not have line numbers.

   A token usually lies entirely on one line of syntax.  The two exceptions
   are a T_STRING token built from several segments on adjacent lines joined
   by "+" punctuators, and a T_NEG_NUM token whose "-" is on one line and whose
   number is on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;

  return lex_source_get_last_line_number (src, n);
}
1024
/* Reports the 1-based column in which the syntax for the token N after the
   current one in LEXER begins.  The result is 0 if LEXER has no current source
   or for a T_STOP token.

   Columns are counted by the width that characters have in a typical
   fixed-width font: CJK characters are 2 columns wide and combining characters
   are 0 columns wide. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;

  return lex_source_get_first_column (src, n);
}
1038
/* Reports the 1-based column in which the syntax for the token N after the
   current one in LEXER ends, plus 1.  The result is 0 if LEXER has no current
   source or for a T_STOP token.

   Columns are counted by the width that characters have in a typical
   fixed-width font: CJK characters are 2 columns wide and combining characters
   are 0 columns wide. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;

  return lex_source_get_last_column (src, n);
}
1052
1053 /* Returns the name of the syntax file from which the current command is drawn.
1054    Returns NULL for a T_STOP token or if the command's source does not have
1055    line numbers.
1056
1057    There is no version of this function that takes an N argument because
1058    lookahead only works to the end of a command and any given command is always
1059    within a single syntax file. */
1060 const char *
1061 lex_get_file_name (const struct lexer *lexer)
1062 {
1063   struct lex_source *src = lex_source__ (lexer);
1064   return src == NULL ? NULL : src->reader->file_name;
1065 }
1066
1067 /* Returns the syntax mode for the syntax file from which the current drawn is
1068    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1069    source does not have line numbers.
1070
1071    There is no version of this function that takes an N argument because
1072    lookahead only works to the end of a command and any given command is always
1073    within a single syntax file. */
1074 enum lex_syntax_mode
1075 lex_get_syntax_mode (const struct lexer *lexer)
1076 {
1077   struct lex_source *src = lex_source__ (lexer);
1078   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1079 }
1080
1081 /* Returns the error mode for the syntax file from which the current drawn is
1082    drawn.  Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
1083    source does not have line numbers.
1084
1085    There is no version of this function that takes an N argument because
1086    lookahead only works to the end of a command and any given command is always
1087    within a single syntax file. */
1088 enum lex_error_mode
1089 lex_get_error_mode (const struct lexer *lexer)
1090 {
1091   struct lex_source *src = lex_source__ (lexer);
1092   return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1093 }
1094
1095 /* If the source that LEXER is currently reading has error mode
1096    LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1097    next token to be read comes directly from whatever is next read from the
1098    stream.
1099
1100    It makes sense to call this function after encountering an error in a
1101    command entered on the console, because usually the user would prefer not to
1102    have cascading errors. */
1103 void
1104 lex_interactive_reset (struct lexer *lexer)
1105 {
1106   struct lex_source *src = lex_source__ (lexer);
1107   if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1108     {
1109       src->head = src->tail = 0;
1110       src->journal_pos = src->seg_pos = src->line_pos = 0;
1111       src->n_newlines = 0;
1112       src->suppress_next_newline = false;
1113       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1114       while (!deque_is_empty (&src->deque))
1115         lex_source_pop__ (src);
1116       lex_source_push_endcmd__ (src);
1117     }
1118 }
1119
1120 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1121 void
1122 lex_discard_rest_of_command (struct lexer *lexer)
1123 {
1124   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1125     lex_get (lexer);
1126 }
1127
1128 /* Discards all lookahead tokens in LEXER, then discards all input sources
1129    until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1130    runs out of input sources. */
1131 void
1132 lex_discard_noninteractive (struct lexer *lexer)
1133 {
1134   struct lex_source *src = lex_source__ (lexer);
1135
1136   if (src != NULL)
1137     {
1138       while (!deque_is_empty (&src->deque))
1139         lex_source_pop__ (src);
1140
1141       for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1142            src = lex_source__ (lexer))
1143         lex_source_destroy (src);
1144     }
1145 }
1146 \f
1147 static size_t
1148 lex_source_max_tail__ (const struct lex_source *src)
1149 {
1150   const struct lex_token *token;
1151   size_t max_tail;
1152
1153   assert (src->seg_pos >= src->line_pos);
1154   max_tail = MIN (src->journal_pos, src->line_pos);
1155
1156   /* Use the oldest token also.  (We know that src->deque cannot be empty
1157      because we are in the process of adding a new token, which is already
1158      initialized enough to use here.) */
1159   token = &src->tokens[deque_back (&src->deque, 0)];
1160   assert (token->token_pos >= token->line_pos);
1161   max_tail = MIN (max_tail, token->line_pos);
1162
1163   return max_tail;
1164 }
1165
1166 static void
1167 lex_source_expand__ (struct lex_source *src)
1168 {
1169   if (src->head - src->tail >= src->allocated)
1170     {
1171       size_t max_tail = lex_source_max_tail__ (src);
1172       if (max_tail > src->tail)
1173         {
1174           /* Advance the tail, freeing up room at the head. */
1175           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1176                    src->head - max_tail);
1177           src->tail = max_tail;
1178         }
1179       else
1180         {
1181           /* Buffer is completely full.  Expand it. */
1182           src->buffer = x2realloc (src->buffer, &src->allocated);
1183         }
1184     }
1185   else
1186     {
1187       /* There's space available at the head of the buffer.  Nothing to do. */
1188     }
1189 }
1190
1191 static void
1192 lex_source_read__ (struct lex_source *src)
1193 {
1194   do
1195     {
1196       size_t head_ofs;
1197       size_t n;
1198
1199       lex_source_expand__ (src);
1200
1201       head_ofs = src->head - src->tail;
1202       n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1203                                     src->allocated - head_ofs,
1204                                     segmenter_get_prompt (&src->segmenter));
1205       if (n == 0)
1206         {
1207           /* End of input.
1208
1209              Ensure that the input always ends in a new-line followed by a null
1210              byte, as required by the segmenter library. */
1211
1212           if (src->head == src->tail
1213               || src->buffer[src->head - src->tail - 1] != '\n')
1214             src->buffer[src->head++ - src->tail] = '\n';
1215
1216           lex_source_expand__ (src);
1217           src->buffer[src->head++ - src->tail] = '\0';
1218
1219           return;
1220         }
1221
1222       src->head += n;
1223     }
1224   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1225                   src->head - src->seg_pos));
1226 }
1227
1228 static struct lex_source *
1229 lex_source__ (const struct lexer *lexer)
1230 {
1231   return (ll_is_empty (&lexer->sources) ? NULL
1232           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1233 }
1234
1235 static struct substring
1236 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1237 {
1238   const struct lex_token *token0 = lex_source_next__ (src, n0);
1239   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1240   size_t start = token0->token_pos;
1241   size_t end = token1->token_pos + token1->token_len;
1242
1243   return ss_buffer (&src->buffer[start - src->tail], end - start);
1244 }
1245
1246 static void
1247 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1248 {
1249   size_t out_maxlen;
1250   size_t out_len;
1251   int mblen;
1252
1253   assert (out_size >= 16);
1254   out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1255   for (out_len = 0; out_len < in.length; out_len += mblen)
1256     {
1257       if (in.string[out_len] == '\n'
1258           || (in.string[out_len] == '\r'
1259               && out_len + 1 < in.length
1260               && in.string[out_len + 1] == '\n'))
1261         break;
1262
1263       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1264                         in.length - out_len);
1265       if (out_len + mblen > out_maxlen)
1266         break;
1267     }
1268
1269   memcpy (out, in.string, out_len);
1270   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1271 }
1272
1273 static void
1274 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1275                          const char *format, va_list args)
1276 {
1277   const struct lex_token *token;
1278   struct string s;
1279   struct msg m;
1280
1281   ds_init_empty (&s);
1282
1283   token = lex_source_next__ (src, n0);
1284   if (token->token.type == T_ENDCMD)
1285     ds_put_cstr (&s, _("Syntax error at end of command"));
1286   else
1287     {
1288       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1289       if (!ss_is_empty (syntax))
1290         {
1291           char syntax_cstr[64];
1292
1293           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1294           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1295         }
1296       else
1297         ds_put_cstr (&s, _("Syntax error"));
1298     }
1299
1300   if (format)
1301     {
1302       ds_put_cstr (&s, ": ");
1303       ds_put_vformat (&s, format, args);
1304     }
1305   ds_put_byte (&s, '.');
1306
1307   m.category = MSG_C_SYNTAX;
1308   m.severity = MSG_S_ERROR;
1309   m.file_name = src->reader->file_name;
1310   m.first_line = lex_source_get_first_line_number (src, n0);
1311   m.last_line = lex_source_get_last_line_number (src, n1);
1312   m.first_column = lex_source_get_first_column (src, n0);
1313   m.last_column = lex_source_get_last_column (src, n1);
1314   m.text = ds_steal_cstr (&s);
1315   msg_emit (&m);
1316 }
1317
1318 static void PRINTF_FORMAT (2, 3)
1319 lex_get_error (struct lex_source *src, const char *format, ...)
1320 {
1321   va_list args;
1322   int n;
1323
1324   va_start (args, format);
1325
1326   n = deque_count (&src->deque) - 1;
1327   lex_source_error_valist (src, n, n, format, args);
1328   lex_source_pop_front (src);
1329
1330   va_end (args);
1331 }
1332
1333 static bool
1334 lex_source_get__ (const struct lex_source *src_)
1335 {
1336   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1337
1338   struct state
1339     {
1340       struct segmenter segmenter;
1341       enum segment_type last_segment;
1342       int newlines;
1343       size_t line_pos;
1344       size_t seg_pos;
1345     };
1346
1347   struct state state, saved;
1348   enum scan_result result;
1349   struct scanner scanner;
1350   struct lex_token *token;
1351   int n_lines;
1352   int i;
1353
1354   if (src->eof)
1355     return false;
1356
1357   state.segmenter = src->segmenter;
1358   state.newlines = 0;
1359   state.seg_pos = src->seg_pos;
1360   state.line_pos = src->line_pos;
1361   saved = state;
1362
1363   token = lex_push_token__ (src);
1364   scanner_init (&scanner, &token->token);
1365   token->line_pos = src->line_pos;
1366   token->token_pos = src->seg_pos;
1367   if (src->reader->line_number > 0)
1368     token->first_line = src->reader->line_number + src->n_newlines;
1369   else
1370     token->first_line = 0;
1371
1372   for (;;)
1373     {
1374       enum segment_type type;
1375       const char *segment;
1376       size_t seg_maxlen;
1377       int seg_len;
1378
1379       segment = &src->buffer[state.seg_pos - src->tail];
1380       seg_maxlen = src->head - state.seg_pos;
1381       seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1382       if (seg_len < 0)
1383         {
1384           lex_source_read__ (src);
1385           continue;
1386         }
1387
1388       state.last_segment = type;
1389       state.seg_pos += seg_len;
1390       if (type == SEG_NEWLINE)
1391         {
1392           state.newlines++;
1393           state.line_pos = state.seg_pos;
1394         }
1395
1396       result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1397                              &token->token);
1398       if (result == SCAN_SAVE)
1399         saved = state;
1400       else if (result == SCAN_BACK)
1401         {
1402           state = saved;
1403           break;
1404         }
1405       else if (result == SCAN_DONE)
1406         break;
1407     }
1408
1409   n_lines = state.newlines;
1410   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1411     {
1412       n_lines++;
1413       src->suppress_next_newline = true;
1414     }
1415   else if (n_lines > 0 && src->suppress_next_newline)
1416     {
1417       n_lines--;
1418       src->suppress_next_newline = false;
1419     }
1420   for (i = 0; i < n_lines; i++)
1421     {
1422       const char *newline;
1423       const char *line;
1424       size_t line_len;
1425       char *syntax;
1426
1427       line = &src->buffer[src->journal_pos - src->tail];
1428       newline = rawmemchr (line, '\n');
1429       line_len = newline - line;
1430       if (line_len > 0 && line[line_len - 1] == '\r')
1431         line_len--;
1432
1433       syntax = malloc (line_len + 2);
1434       memcpy (syntax, line, line_len);
1435       syntax[line_len] = '\n';
1436       syntax[line_len + 1] = '\0';
1437
1438       text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1439
1440       src->journal_pos += newline - line + 1;
1441     }
1442
1443   token->token_len = state.seg_pos - src->seg_pos;
1444
1445   src->segmenter = state.segmenter;
1446   src->seg_pos = state.seg_pos;
1447   src->line_pos = state.line_pos;
1448   src->n_newlines += state.newlines;
1449
1450   switch (token->token.type)
1451     {
1452     default:
1453       break;
1454
1455     case T_STOP:
1456       token->token.type = T_ENDCMD;
1457       src->eof = true;
1458       break;
1459
1460     case SCAN_BAD_HEX_LENGTH:
1461       lex_get_error (src, _("String of hex digits has %d characters, which "
1462                             "is not a multiple of 2"),
1463                      (int) token->token.number);
1464       break;
1465
1466     case SCAN_BAD_HEX_DIGIT:
1467     case SCAN_BAD_UNICODE_DIGIT:
1468       lex_get_error (src, _("`%c' is not a valid hex digit"),
1469                      (int) token->token.number);
1470       break;
1471
1472     case SCAN_BAD_UNICODE_LENGTH:
1473       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1474                             "not in the valid range of 1 to 8 bytes"),
1475                      (int) token->token.number);
1476       break;
1477
1478     case SCAN_BAD_UNICODE_CODE_POINT:
1479       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1480                      (int) token->token.number);
1481       break;
1482
1483     case SCAN_EXPECTED_QUOTE:
1484       lex_get_error (src, _("Unterminated string constant"));
1485       break;
1486
1487     case SCAN_EXPECTED_EXPONENT:
1488       lex_get_error (src, _("Missing exponent following `%s'"),
1489                      token->token.string.string);
1490       break;
1491
1492     case SCAN_UNEXPECTED_DOT:
1493       lex_get_error (src, _("Unexpected `.' in middle of command"));
1494       break;
1495
1496     case SCAN_UNEXPECTED_CHAR:
1497       {
1498         char c_name[16];
1499         lex_get_error (src, _("Bad character %s in input"),
1500                        uc_name (token->token.number, c_name));
1501       }
1502       break;
1503
1504     case SCAN_SKIP:
1505       lex_source_pop_front (src);
1506       break;
1507     }
1508
1509   return true;
1510 }
1511 \f
1512 static void
1513 lex_source_push_endcmd__ (struct lex_source *src)
1514 {
1515   struct lex_token *token = lex_push_token__ (src);
1516   token->token.type = T_ENDCMD;
1517   token->token_pos = 0;
1518   token->token_len = 0;
1519   token->line_pos = 0;
1520   token->first_line = 0;
1521 }
1522
1523 static struct lex_source *
1524 lex_source_create (struct lex_reader *reader)
1525 {
1526   struct lex_source *src;
1527   enum segmenter_mode mode;
1528
1529   src = xzalloc (sizeof *src);
1530   src->reader = reader;
1531
1532   if (reader->syntax == LEX_SYNTAX_AUTO)
1533     mode = SEG_MODE_AUTO;
1534   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1535     mode = SEG_MODE_INTERACTIVE;
1536   else if (reader->syntax == LEX_SYNTAX_BATCH)
1537     mode = SEG_MODE_BATCH;
1538   else
1539     NOT_REACHED ();
1540   segmenter_init (&src->segmenter, mode);
1541
1542   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1543
1544   lex_source_push_endcmd__ (src);
1545
1546   return src;
1547 }
1548
1549 static void
1550 lex_source_destroy (struct lex_source *src)
1551 {
1552   char *file_name = src->reader->file_name;
1553   if (src->reader->class->destroy != NULL)
1554     src->reader->class->destroy (src->reader);
1555   free (file_name);
1556   free (src->buffer);
1557   while (!deque_is_empty (&src->deque))
1558     lex_source_pop__ (src);
1559   free (src->tokens);
1560   ll_remove (&src->ll);
1561   free (src);
1562 }
1563 \f
/* A lex_reader that obtains syntax from a u8_istream opened on a file or on
   stdin. */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Embedded generic reader; see
                                   lex_file_reader_cast(). */
    struct u8_istream *istream; /* Stream that input is read from. */
    char *file_name;            /* Private copy of the file name, used in
                                   error messages; freed by
                                   lex_file_close(). */
  };
1570
/* Declared here so that lex_reader_for_file() can refer to it; the
   initializer follows the functions that it names. */
static struct lex_reader_class lex_file_reader_class;
1572
1573 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1574    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1575    ENCODING, which should take one of the forms accepted by
1576    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1577    mode of the new reader, respectively.
1578
1579    Returns a null pointer if FILE_NAME cannot be opened. */
1580 struct lex_reader *
1581 lex_reader_for_file (const char *file_name, const char *encoding,
1582                      enum lex_syntax_mode syntax,
1583                      enum lex_error_mode error)
1584 {
1585   struct lex_file_reader *r;
1586   struct u8_istream *istream;
1587
1588   istream = (!strcmp(file_name, "-")
1589              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1590              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1591   if (istream == NULL)
1592     {
1593       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1594       return NULL;
1595     }
1596
1597   r = xmalloc (sizeof *r);
1598   lex_reader_init (&r->reader, &lex_file_reader_class);
1599   r->reader.syntax = syntax;
1600   r->reader.error = error;
1601   r->reader.file_name = xstrdup (file_name);
1602   r->reader.line_number = 1;
1603   r->istream = istream;
1604   r->file_name = xstrdup (file_name);
1605
1606   return &r->reader;
1607 }
1608
1609 static struct lex_file_reader *
1610 lex_file_reader_cast (struct lex_reader *r)
1611 {
1612   return UP_CAST (r, struct lex_file_reader, reader);
1613 }
1614
1615 static size_t
1616 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1617                enum prompt_style prompt_style UNUSED)
1618 {
1619   struct lex_file_reader *r = lex_file_reader_cast (r_);
1620   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1621   if (n_read < 0)
1622     {
1623       msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
1624       return 0;
1625     }
1626   return n_read;
1627 }
1628
1629 static void
1630 lex_file_close (struct lex_reader *r_)
1631 {
1632   struct lex_file_reader *r = lex_file_reader_cast (r_);
1633
1634   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1635     {
1636       if (u8_istream_close (r->istream) != 0)
1637         msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
1638     }
1639   else
1640     u8_istream_free (r->istream);
1641
1642   free (r->file_name);
1643   free (r);
1644 }
1645
/* Reader class for lex_file_reader, installed by lex_reader_for_file(). */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,              /* Fetches input from the stream. */
    lex_file_close              /* Releases the stream and the reader. */
  };
1651 \f
/* A lex_reader that obtains syntax from a string held in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded generic reader; see
                                   lex_string_reader_cast(). */
    struct substring s;         /* The syntax; released with ss_dealloc() by
                                   lex_string_close(). */
    size_t offset;              /* Number of bytes of S already handed out by
                                   lex_string_read(). */
  };
1658
/* Declared here so that lex_reader_for_substring_nocopy() can refer to it;
   the initializer follows the functions that it names. */
static struct lex_reader_class lex_string_reader_class;
1660
1661 /* Creates and returns a new lex_reader for the contents of S, which must be
1662    encoded in UTF-8.  The new reader takes ownership of S and will free it
1663    with ss_dealloc() when it is closed. */
1664 struct lex_reader *
1665 lex_reader_for_substring_nocopy (struct substring s)
1666 {
1667   struct lex_string_reader *r;
1668
1669   r = xmalloc (sizeof *r);
1670   lex_reader_init (&r->reader, &lex_string_reader_class);
1671   r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1672   r->s = s;
1673   r->offset = 0;
1674
1675   return &r->reader;
1676 }
1677
1678 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1679    which must be encoded in UTF-8.  The caller retains ownership of S. */
1680 struct lex_reader *
1681 lex_reader_for_string (const char *s)
1682 {
1683   struct substring ss;
1684   ss_alloc_substring (&ss, ss_cstr (s));
1685   return lex_reader_for_substring_nocopy (ss);
1686 }
1687
/* Expands the printf()-like FORMAT with the arguments that follow it, and
   creates and returns a new lex_reader that supplies the resulting text.  The
   new reader takes ownership of the formatted string. */
struct lex_reader *
lex_reader_for_format (const char *format, ...)
{
  va_list args;
  char *text;

  va_start (args, format);
  text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text));
}
1702
1703 static struct lex_string_reader *
1704 lex_string_reader_cast (struct lex_reader *r)
1705 {
1706   return UP_CAST (r, struct lex_string_reader, reader);
1707 }
1708
1709 static size_t
1710 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1711                  enum prompt_style prompt_style UNUSED)
1712 {
1713   struct lex_string_reader *r = lex_string_reader_cast (r_);
1714   size_t chunk;
1715
1716   chunk = MIN (n, r->s.length - r->offset);
1717   memcpy (buf, r->s.string + r->offset, chunk);
1718   r->offset += chunk;
1719
1720   return chunk;
1721 }
1722
1723 static void
1724 lex_string_close (struct lex_reader *r_)
1725 {
1726   struct lex_string_reader *r = lex_string_reader_cast (r_);
1727
1728   ss_dealloc (&r->s);
1729   free (r);
1730 }
1731
/* Reader class for lex_string_reader, installed by
   lex_reader_for_substring_nocopy(). */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,            /* Hands out the next part of the string. */
    lex_string_close            /* Releases the string and the reader. */
  };