pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "data/file-name.h"
  34 #include "language/command.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/text-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source: the token proper plus a record of where in the
   source's buffer the syntax for it was found. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head.

       These are offsets into the whole source; subtract src->tail to obtain
       an index into src->buffer. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos (0 is treated as
                                   "no line number" by the accessors). */
  };
  72
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens.

   The position members ('tail', 'head', 'journal_pos', 'seg_pos', 'line_pos')
   are byte offsets into the source as a whole, not into 'buffer'; code in
   this file subtracts 'tail' from a position to index 'buffer'. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* Reader wrapped by this source. */
    struct segmenter segmenter; /* NOTE(review): presumably the segmentation
                                   state for 'reader''s bytes; see
                                   language/lexer/segment.h. */
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline; /* NOTE(review): not used in the visible part
                                   of this file; confirm meaning at its point
                                   of use. */

    /* Tokens.

       lex_push_token__() adds new tokens at the front of 'deque' and
       lex_source_pop__() removes them from the back, so the back entry is the
       current token. */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 102
 103 static struct lex_source *lex_source_create (struct lex_reader *);
 104 static void lex_source_destroy (struct lex_source *);
 105
/* Lexer.

   A lexer is just an ordered list of sources.  lex_include() pushes a new
   source at the head of the list and lex_append() pushes one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
  };
 111
 112 static struct lex_source *lex_source__ (const struct lexer *);
 113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 114 static void lex_source_push_endcmd__ (struct lex_source *);
 115
 116 static void lex_source_pop__ (struct lex_source *);
 117 static bool lex_source_get__ (const struct lex_source *);
 118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 119                                      const char *format, va_list)
 120    PRINTF_FORMAT (4, 0);
 121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 122                                                   int n);
 123 \f
 124 /* Initializes READER with the specified CLASS and otherwise some reasonable
 125    defaults.  The caller should fill in the others members as desired. */
 126 void
 127 lex_reader_init (struct lex_reader *reader,
 128                  const struct lex_reader_class *class)
 129 {
 130   reader->class = class;
 131   reader->syntax = LEX_SYNTAX_AUTO;
 132   reader->error = LEX_ERROR_INTERACTIVE;
 133   reader->file_name = NULL;
 134   reader->line_number = 0;
 135 }
 136
 137 /* Frees any file name already in READER and replaces it by a copy of
 138    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 139 void
 140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 141 {
 142   free (reader->file_name);
 143   reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
 144 }
 145 \f
 146 /* Creates and returns a new lexer. */
 147 struct lexer *
 148 lex_create (void)
 149 {
 150   struct lexer *lexer = xzalloc (sizeof *lexer);
 151   ll_init (&lexer->sources);
 152   return lexer;
 153 }
 154
 155 /* Destroys LEXER. */
 156 void
 157 lex_destroy (struct lexer *lexer)
 158 {
 159   if (lexer != NULL)
 160     {
 161       struct lex_source *source, *next;
 162
 163       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 164         lex_source_destroy (source);
 165       free (lexer);
 166     }
 167 }
 168
 169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 170    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 171    token. */
 172 void
 173 lex_include (struct lexer *lexer, struct lex_reader *reader)
 174 {
 175   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 176   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 177 }
 178
 179 /* Appends READER to LEXER, so that it will be read after all other current
 180    readers have already been read. */
 181 void
 182 lex_append (struct lexer *lexer, struct lex_reader *reader)
 183 {
 184   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 185 }
 186 \f
/* Advancing. */
 188
 189 static struct lex_token *
 190 lex_push_token__ (struct lex_source *src)
 191 {
 192   struct lex_token *token;
 193
 194   if (deque_is_full (&src->deque))
 195     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 196
 197   token = &src->tokens[deque_push_front (&src->deque)];
 198   token_init (&token->token);
 199   return token;
 200 }
 201
 202 static void
 203 lex_source_pop__ (struct lex_source *src)
 204 {
 205   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 206 }
 207
 208 static void
 209 lex_source_pop_front (struct lex_source *src)
 210 {
 211   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 212 }
 213
 214 /* Advances LEXER to the next token, consuming the current token. */
 215 void
 216 lex_get (struct lexer *lexer)
 217 {
 218   struct lex_source *src;
 219
 220   src = lex_source__ (lexer);
 221   if (src == NULL)
 222     return;
 223
 224   if (!deque_is_empty (&src->deque))
 225     lex_source_pop__ (src);
 226
 227   while (deque_is_empty (&src->deque))
 228     if (!lex_source_get__ (src))
 229       {
 230         lex_source_destroy (src);
 231         src = lex_source__ (lexer);
 232         if (src == NULL)
 233           return;
 234       }
 235 }
 236 \f
 237 /* Issuing errors. */
 238
 239 /* Prints a syntax error message containing the current token and
 240    given message MESSAGE (if non-null). */
 241 void
 242 lex_error (struct lexer *lexer, const char *format, ...)
 243 {
 244   va_list args;
 245
 246   va_start (args, format);
 247   lex_next_error_valist (lexer, 0, 0, format, args);
 248   va_end (args);
 249 }
 250
/* Same as lex_error(), except that the printf-style arguments for FORMAT are
   supplied as the va_list ARGS.  Reports a syntax error at LEXER's current
   token; FORMAT may be null. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}
 258
 259 /* Prints a syntax error message containing the current token and
 260    given message MESSAGE (if non-null). */
 261 void
 262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 263 {
 264   va_list args;
 265
 266   va_start (args, format);
 267   lex_next_error_valist (lexer, n0, n1, format, args);
 268   va_end (args);
 269 }
 270
 271 /* Prints a syntax error message saying that OPTION0 or one of the other
 272    strings following it, up to the first NULL, is expected. */
 273 void
 274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
 275 {
 276   enum { MAX_OPTIONS = 8 };
 277   const char *options[MAX_OPTIONS + 1];
 278   va_list args;
 279   int n;
 280
 281   va_start (args, option0);
 282   options[0] = option0;
 283   n = 0;
 284   while (n + 1 < MAX_OPTIONS && options[n] != NULL)
 285     options[++n] = va_arg (args, const char *);
 286   va_end (args);
 287
 288   switch (n)
 289     {
 290     case 0:
 291       lex_error (lexer, NULL);
 292       break;
 293
 294     case 1:
 295       lex_error (lexer, _("expecting %s"), options[0]);
 296       break;
 297
 298     case 2:
 299       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 300       break;
 301
 302     case 3:
 303       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 304                  options[2]);
 305       break;
 306
 307     case 4:
 308       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 309                  options[0], options[1], options[2], options[3]);
 310       break;
 311
 312     case 5:
 313       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 314                  options[0], options[1], options[2], options[3], options[4]);
 315       break;
 316
 317     case 6:
 318       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 319                  options[0], options[1], options[2], options[3], options[4],
 320                  options[5]);
 321       break;
 322
 323     case 7:
 324       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 325                  options[0], options[1], options[2], options[3], options[4],
 326                  options[5], options[6]);
 327       break;
 328
 329     case 8:
 330       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 331                  options[0], options[1], options[2], options[3], options[4],
 332                  options[5], options[6], options[7]);
 333       break;
 334
 335     default:
 336       NOT_REACHED ();
 337     }
 338 }
 339
/* Reports an error to the effect that subcommand SBC may only be
   specified once.

   SBC is the subcommand's name as it should appear in the message.  Unlike
   most error functions here, this one takes no lexer and reports the error
   directly with msg(). */
void
lex_sbc_only_once (const char *sbc)
{
  msg (SE, _("Subcommand %s may only be specified once."), sbc);
}
 347
/* Reports a syntax error, at LEXER's current token, to the effect that
   required subcommand SBC is missing.  SBC is the subcommand's name as it
   should appear in the message. */
void
lex_sbc_missing (struct lexer *lexer, const char *sbc)
{
  lex_error (lexer, _("missing required subcommand %s"), sbc);
}
 355
 356 /* Prints a syntax error message containing the current token and
 357    given message MESSAGE (if non-null). */
 358 void
 359 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 360                        const char *format, va_list args)
 361 {
 362   struct lex_source *src = lex_source__ (lexer);
 363
 364   if (src != NULL)
 365     lex_source_error_valist (src, n0, n1, format, args);
 366   else
 367     {
 368       struct string s;
 369
 370       ds_init_empty (&s);
 371       ds_put_format (&s, _("Syntax error at end of input"));
 372       if (format != NULL)
 373         {
 374           ds_put_cstr (&s, ": ");
 375           ds_put_vformat (&s, format, args);
 376         }
 377       ds_put_byte (&s, '.');
 378       msg (SE, "%s", ds_cstr (&s));
 379       ds_destroy (&s);
 380     }
 381 }
 382
 383 /* Checks that we're at end of command.
 384    If so, returns a successful command completion code.
 385    If not, flags a syntax error and returns an error command
 386    completion code. */
 387 int
 388 lex_end_of_command (struct lexer *lexer)
 389 {
 390   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 391     {
 392       lex_error (lexer, _("expecting end of command"));
 393       return CMD_FAILURE;
 394     }
 395   else
 396     return CMD_SUCCESS;
 397 }
 398 \f
 399 /* Token testing functions. */
 400
/* Returns true if LEXER's current token is a number (T_POS_NUM or
   T_NEG_NUM), false otherwise. */
bool
lex_is_number (struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}
 407
/* Returns true if LEXER's current token is a string (T_STRING), false
   otherwise. */
bool
lex_is_string (struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}
 414
/* Returns the value of LEXER's current token, which must be a number
   (T_POS_NUM or T_NEG_NUM); this is checked with an assertion.  Use
   lex_is_number() or lex_force_num() first. */
double
lex_number (struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}
 422
/* Returns true iff LEXER's current token is a number with an integer value
   that lex_integer() can return as a "long". */
bool
lex_is_integer (struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}
 429
/* Returns the value of LEXER's current token, which must be an integer; this
   is checked with an assertion.  Use lex_is_integer() or lex_force_int()
   first. */
long
lex_integer (struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
 437 \f
 438 /* Token testing functions with lookahead.
 439
 440    A value of 0 for N as an argument to any of these functions refers to the
 441    current token.  Lookahead is limited to the current command.  Any N greater
 442    than the number of tokens remaining in the current command will be treated
 443    as referring to a T_ENDCMD token. */
 444
 445 /* Returns true if the token N ahead of the current token is a number. */
 446 bool
 447 lex_next_is_number (struct lexer *lexer, int n)
 448 {
 449   enum token_type next_token = lex_next_token (lexer, n);
 450   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 451 }
 452
/* Returns true if the token N ahead of LEXER's current token has type
   T_STRING, false otherwise. */
bool
lex_next_is_string (struct lexer *lexer, int n)
{
  return lex_next_token (lexer, n) == T_STRING;
}
 459
/* Returns the value of the token N ahead of LEXER's current token, which must
   be a number (T_POS_NUM or T_NEG_NUM).  The requirement is enforced with an
   assertion, so callers should test with lex_next_is_number() first. */
double
lex_next_number (struct lexer *lexer, int n)
{
  assert (lex_next_is_number (lexer, n));
  return lex_next_tokval (lexer, n);
}
 468
 469 /* Returns true if the token N ahead of the current token is an integer. */
 470 bool
 471 lex_next_is_integer (struct lexer *lexer, int n)
 472 {
 473   double value;
 474
 475   if (!lex_next_is_number (lexer, n))
 476     return false;
 477
 478   value = lex_next_tokval (lexer, n);
 479   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 480 }
 481
/* Returns the value of the token N ahead of LEXER's current token, which must
   be an integer as determined by lex_next_is_integer().  The requirement is
   enforced with an assertion.  The token's double value is converted to
   "long" on return. */
long
lex_next_integer (struct lexer *lexer, int n)
{
  assert (lex_next_is_integer (lexer, n));
  return lex_next_tokval (lexer, n);
}
 490 \f
 491 /* Token matching functions. */
 492
 493 /* If the current token has the specified TYPE, skips it and returns true.
 494    Otherwise, returns false. */
 495 bool
 496 lex_match (struct lexer *lexer, enum token_type type)
 497 {
 498   if (lex_token (lexer) == type)
 499     {
 500       lex_get (lexer);
 501       return true;
 502     }
 503   else
 504     return false;
 505 }
 506
/* If the current token matches IDENTIFIER, skips it and returns true.
   IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
   returns false.

   This is lex_match_id_n() with a minimum abbreviation length of 3.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  return lex_match_id_n (lexer, identifier, 3);
}
 517
 518 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 519    may be abbreviated to its first N letters.  Otherwise, returns false.
 520
 521    IDENTIFIER must be an ASCII string. */
 522 bool
 523 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 524 {
 525   if (lex_token (lexer) == T_ID
 526       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 527     {
 528       lex_get (lexer);
 529       return true;
 530     }
 531   else
 532     return false;
 533 }
 534
 535 /* If the current token is integer X, skips it and returns true.  Otherwise,
 536    returns false. */
 537 bool
 538 lex_match_int (struct lexer *lexer, int x)
 539 {
 540   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 541     {
 542       lex_get (lexer);
 543       return true;
 544     }
 545   else
 546     return false;
 547 }
 548 \f
 549 /* Forced matches. */
 550
 551 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 552    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 553    false.
 554
 555    IDENTIFIER must be an ASCII string. */
 556 bool
 557 lex_force_match_id (struct lexer *lexer, const char *identifier)
 558 {
 559   if (lex_match_id (lexer, identifier))
 560     return true;
 561   else
 562     {
 563       lex_error_expecting (lexer, identifier, NULL_SENTINEL);
 564       return false;
 565     }
 566 }
 567
 568 /* If the current token has the specified TYPE, skips it and returns true.
 569    Otherwise, reports an error and returns false. */
 570 bool
 571 lex_force_match (struct lexer *lexer, enum token_type type)
 572 {
 573   if (lex_token (lexer) == type)
 574     {
 575       lex_get (lexer);
 576       return true;
 577     }
 578   else
 579     {
 580       char *s = xasprintf ("`%s'", token_type_to_string (type));
 581       lex_error_expecting (lexer, s, NULL_SENTINEL);
 582       free (s);
 583       return false;
 584     }
 585 }
 586
 587 /* If the current token is a string, does nothing and returns true.
 588    Otherwise, reports an error and returns false. */
 589 bool
 590 lex_force_string (struct lexer *lexer)
 591 {
 592   if (lex_is_string (lexer))
 593     return true;
 594   else
 595     {
 596       lex_error (lexer, _("expecting string"));
 597       return false;
 598     }
 599 }
 600
 601 /* If the current token is an integer, does nothing and returns true.
 602    Otherwise, reports an error and returns false. */
 603 bool
 604 lex_force_int (struct lexer *lexer)
 605 {
 606   if (lex_is_integer (lexer))
 607     return true;
 608   else
 609     {
 610       lex_error (lexer, _("expecting integer"));
 611       return false;
 612     }
 613 }
 614
 615 /* If the current token is a number, does nothing and returns true.
 616    Otherwise, reports an error and returns false. */
 617 bool
 618 lex_force_num (struct lexer *lexer)
 619 {
 620   if (lex_is_number (lexer))
 621     return true;
 622
 623   lex_error (lexer, _("expecting number"));
 624   return false;
 625 }
 626
 627 /* If the current token is an identifier, does nothing and returns true.
 628    Otherwise, reports an error and returns false. */
 629 bool
 630 lex_force_id (struct lexer *lexer)
 631 {
 632   if (lex_token (lexer) == T_ID)
 633     return true;
 634
 635   lex_error (lexer, _("expecting identifier"));
 636   return false;
 637 }
 638 \f
 639 /* Token accessors. */
 640
/* Returns the type of LEXER's current token.  If LEXER has no source left,
   the current token is T_STOP. */
enum token_type
lex_token (const struct lexer *lexer)
{
  return lex_next_token (lexer, 0);
}
 647
/* Returns the number in LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function is documented to return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  return lex_next_tokval (lexer, 0);
}
 657
/* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.

   Only T_ID and T_STRING tokens have meaningful strings.  Do not rely on the
   result for other token types.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  return lex_next_tokcstr (lexer, 0);
}
 671
/* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
   null-terminated (but the null terminator is not included in the returned
   substring's 'length').

   Only T_ID and T_STRING tokens have meaningful strings.  Do not rely on the
   result for other token types.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
struct substring
lex_tokss (const struct lexer *lexer)
{
  return lex_next_tokss (lexer, 0);
}
 687 \f
 688 /* Looking ahead.
 689
 690    A value of 0 for N as an argument to any of these functions refers to the
 691    current token.  Lookahead is limited to the current command.  Any N greater
 692    than the number of tokens remaining in the current command will be treated
 693    as referring to a T_ENDCMD token. */
 694
 695 static const struct lex_token *
 696 lex_next__ (const struct lexer *lexer_, int n)
 697 {
 698   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 699   struct lex_source *src = lex_source__ (lexer);
 700
 701   if (src != NULL)
 702     return lex_source_next__ (src, n);
 703   else
 704     {
 705       static const struct lex_token stop_token =
 706         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 707
 708       return &stop_token;
 709     }
 710 }
 711
 712 static const struct lex_token *
 713 lex_source_next__ (const struct lex_source *src, int n)
 714 {
 715   while (deque_count (&src->deque) <= n)
 716     {
 717       if (!deque_is_empty (&src->deque))
 718         {
 719           struct lex_token *front;
 720
 721           front = &src->tokens[deque_front (&src->deque, 0)];
 722           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 723             return front;
 724         }
 725
 726       lex_source_get__ (src);
 727     }
 728
 729   return &src->tokens[deque_back (&src->deque, n)];
 730 }
 731
/* Returns the "struct token" of the token N after the current one in LEXER
   (N of 0 means the current token itself).
   The returned pointer can be invalidated by pretty much any succeeding call
   into the lexer, although the string pointer within the returned token is
   only invalidated by consuming the token (e.g. with lex_get()). */
const struct token *
lex_next (const struct lexer *lexer, int n)
{
  return &lex_next__ (lexer, n)->token;
}
 741
/* Returns the type of the token N after the current one in LEXER (N of 0
   means the current token itself). */
enum token_type
lex_next_token (const struct lexer *lexer, int n)
{
  return lex_next (lexer, n)->type;
}
 748
 749 /* Returns the number in the tokn N after the current one in LEXER.
 750
 751    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 752    tokens this function will always return zero. */
 753 double
 754 lex_next_tokval (const struct lexer *lexer, int n)
 755 {
 756   const struct token *token = lex_next (lexer, n);
 757   return token->number;
 758 }
 759
/* Returns the null-terminated string in the token N after the current one, in
   UTF-8 encoding.  This is the 'string' pointer of the substring that
   lex_next_tokss() returns.

   Only T_ID and T_STRING tokens have meaningful strings.  Do not rely on the
   result for other token types.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
const char *
lex_next_tokcstr (const struct lexer *lexer, int n)
{
  return lex_next_tokss (lexer, n).string;
}
 774
/* Returns the string in the token N after the current one, in UTF-8 encoding.
   The string is null-terminated (but the null terminator is not included in
   the returned substring's 'length').

   Only T_ID and T_STRING tokens have meaningful strings.  Do not rely on the
   result for other token types.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
struct substring
lex_next_tokss (const struct lexer *lexer, int n)
{
  return lex_next (lexer, n)->string;
}
 790
 791 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
 792    true.  Otherwise, returns false.
 793
 794    S may consist of an arbitrary number of identifiers, integers, and
 795    punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
 796    Identifiers may be abbreviated to their first three letters.  Currently only
 797    hyphens, slashes, and equals signs are supported as punctuation (but it
 798    would be easy to add more).
 799
 800    S must be an ASCII string. */
 801 bool
 802 lex_match_phrase (struct lexer *lexer, const char *s)
 803 {
 804   int tok_idx;
 805
 806   for (tok_idx = 0; ; tok_idx++)
 807     {
 808       enum token_type token;
 809       unsigned char c;
 810
 811       while (c_isspace (*s))
 812         s++;
 813
 814       c = *s;
 815       if (c == '\0')
 816         {
 817           int i;
 818
 819           for (i = 0; i < tok_idx; i++)
 820             lex_get (lexer);
 821           return true;
 822         }
 823
 824       token = lex_next_token (lexer, tok_idx);
 825       switch (c)
 826         {
 827         case '-':
 828           if (token != T_DASH)
 829             return false;
 830           s++;
 831           break;
 832
 833         case '/':
 834           if (token != T_SLASH)
 835             return false;
 836           s++;
 837           break;
 838
 839         case '=':
 840           if (token != T_EQUALS)
 841             return false;
 842           s++;
 843           break;
 844
 845         case '0': case '1': case '2': case '3': case '4':
 846         case '5': case '6': case '7': case '8': case '9':
 847           {
 848             unsigned int value;
 849
 850             if (token != T_POS_NUM)
 851               return false;
 852
 853             value = 0;
 854             do
 855               {
 856                 value = value * 10 + (*s++ - '0');
 857               }
 858             while (c_isdigit (*s));
 859
 860             if (lex_next_tokval (lexer, tok_idx) != value)
 861               return false;
 862           }
 863           break;
 864
 865         default:
 866           if (lex_is_id1 (c))
 867             {
 868               int len;
 869
 870               if (token != T_ID)
 871                 return false;
 872
 873               len = lex_id_get_length (ss_cstr (s));
 874               if (!lex_id_match (ss_buffer (s, len),
 875                                  lex_next_tokss (lexer, tok_idx)))
 876                 return false;
 877
 878               s += len;
 879             }
 880           else
 881             NOT_REACHED ();
 882         }
 883     }
 884 }
 885
/* Returns the 'first_line' recorded for the token N ahead of the current one
   in SRC, that is, the line number at the start of the token's syntax. */
static int
lex_source_get_first_line_number (const struct lex_source *src, int n)
{
  return lex_source_next__ (src, n)->first_line;
}
 891
 892 static int
 893 count_newlines (char *s, size_t length)
 894 {
 895   int n_newlines = 0;
 896   char *newline;
 897
 898   while ((newline = memchr (s, '\n', length)) != NULL)
 899     {
 900       n_newlines++;
 901       length -= (newline + 1) - s;
 902       s = newline + 1;
 903     }
 904
 905   return n_newlines;
 906 }
 907
 908 static int
 909 lex_source_get_last_line_number (const struct lex_source *src, int n)
 910 {
 911   const struct lex_token *token = lex_source_next__ (src, n);
 912
 913   if (token->first_line == 0)
 914     return 0;
 915   else
 916     {
 917       char *token_str = &src->buffer[token->token_pos - src->tail];
 918       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 919     }
 920 }
 921
 922 static int
 923 count_columns (const char *s_, size_t length)
 924 {
 925   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 926   int columns;
 927   size_t ofs;
 928   int mblen;
 929
 930   columns = 0;
 931   for (ofs = 0; ofs < length; ofs += mblen)
 932     {
 933       ucs4_t uc;
 934
 935       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 936       if (uc != '\t')
 937         {
 938           int width = uc_width (uc, "UTF-8");
 939           if (width > 0)
 940             columns += width;
 941         }
 942       else
 943         columns = ROUND_UP (columns + 1, 8);
 944     }
 945
 946   return columns + 1;
 947 }
 948
 949 static int
 950 lex_source_get_first_column (const struct lex_source *src, int n)
 951 {
 952   const struct lex_token *token = lex_source_next__ (src, n);
 953   return count_columns (&src->buffer[token->line_pos - src->tail],
 954                         token->token_pos - token->line_pos);
 955 }
 956
 957 static int
 958 lex_source_get_last_column (const struct lex_source *src, int n)
 959 {
 960   const struct lex_token *token = lex_source_next__ (src, n);
 961   char *start, *end, *newline;
 962
 963   start = &src->buffer[token->line_pos - src->tail];
 964   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 965   newline = memrchr (start, '\n', end - start);
 966   if (newline != NULL)
 967     start = newline + 1;
 968   return count_columns (start, end - start);
 969 }
 970
 971 /* Returns the 1-based line number of the start of the syntax that represents
 972    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
 973    if the token is drawn from a source that does not have line numbers. */
 974 int
 975 lex_get_first_line_number (const struct lexer *lexer, int n)
 976 {
 977   const struct lex_source *src = lex_source__ (lexer);
 978   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
 979 }
 980
 981 /* Returns the 1-based line number of the end of the syntax that represents the
 982    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 983    token or if the token is drawn from a source that does not have line
 984    numbers.
 985
 986    Most of the time, a single token is wholly within a single line of syntax,
 987    but there are two exceptions: a T_STRING token can be made up of multiple
 988    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
 989    token can consist of a "-" on one line followed by the number on the next.
 990  */
 991 int
 992 lex_get_last_line_number (const struct lexer *lexer, int n)
 993 {
 994   const struct lex_source *src = lex_source__ (lexer);
 995   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
 996 }
 997
 998 /* Returns the 1-based column number of the start of the syntax that represents
 999    the token N after the current one in LEXER.  Returns 0 for a T_STOP
1000    token.
1001
1002    Column numbers are measured according to the width of characters as shown in
1003    a typical fixed-width font, in which CJK characters have width 2 and
1004    combining characters have width 0.  */
1005 int
1006 lex_get_first_column (const struct lexer *lexer, int n)
1007 {
1008   const struct lex_source *src = lex_source__ (lexer);
1009   return src != NULL ? lex_source_get_first_column (src, n) : 0;
1010 }
1011
/* Returns 1 plus the 1-based column number at which the syntax for the token
   N after the current one in LEXER ends.  Returns 0 if LEXER has no current
   source or for a T_STOP token.

   Columns are counted by display width in a typical fixed-width font: CJK
   characters occupy 2 columns and combining characters occupy none. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1025
1026 /* Returns the name of the syntax file from which the current command is drawn.
1027    Returns NULL for a T_STOP token or if the command's source does not have
1028    line numbers.
1029
1030    There is no version of this function that takes an N argument because
1031    lookahead only works to the end of a command and any given command is always
1032    within a single syntax file. */
1033 const char *
1034 lex_get_file_name (const struct lexer *lexer)
1035 {
1036   struct lex_source *src = lex_source__ (lexer);
1037   return src == NULL ? NULL : src->reader->file_name;
1038 }
1039
1040 /* Returns the syntax mode for the syntax file from which the current drawn is
1041    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1042    source does not have line numbers.
1043
1044    There is no version of this function that takes an N argument because
1045    lookahead only works to the end of a command and any given command is always
1046    within a single syntax file. */
1047 enum lex_syntax_mode
1048 lex_get_syntax_mode (const struct lexer *lexer)
1049 {
1050   struct lex_source *src = lex_source__ (lexer);
1051   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1052 }
1053
1054 /* Returns the error mode for the syntax file from which the current drawn is
1055    drawn.  Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
1056    source does not have line numbers.
1057
1058    There is no version of this function that takes an N argument because
1059    lookahead only works to the end of a command and any given command is always
1060    within a single syntax file. */
1061 enum lex_error_mode
1062 lex_get_error_mode (const struct lexer *lexer)
1063 {
1064   struct lex_source *src = lex_source__ (lexer);
1065   return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1066 }
1067
1068 /* If the source that LEXER is currently reading has error mode
1069    LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1070    next token to be read comes directly from whatever is next read from the
1071    stream.
1072
1073    It makes sense to call this function after encountering an error in a
1074    command entered on the console, because usually the user would prefer not to
1075    have cascading errors. */
1076 void
1077 lex_interactive_reset (struct lexer *lexer)
1078 {
1079   struct lex_source *src = lex_source__ (lexer);
1080   if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1081     {
1082       src->head = src->tail = 0;
1083       src->journal_pos = src->seg_pos = src->line_pos = 0;
1084       src->n_newlines = 0;
1085       src->suppress_next_newline = false;
1086       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1087       while (!deque_is_empty (&src->deque))
1088         lex_source_pop__ (src);
1089       lex_source_push_endcmd__ (src);
1090     }
1091 }
1092
1093 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1094 void
1095 lex_discard_rest_of_command (struct lexer *lexer)
1096 {
1097   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1098     lex_get (lexer);
1099 }
1100
1101 /* Discards all lookahead tokens in LEXER, then discards all input sources
1102    until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1103    runs out of input sources. */
1104 void
1105 lex_discard_noninteractive (struct lexer *lexer)
1106 {
1107   struct lex_source *src = lex_source__ (lexer);
1108
1109   if (src != NULL)
1110     {
1111       while (!deque_is_empty (&src->deque))
1112         lex_source_pop__ (src);
1113
1114       for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1115            src = lex_source__ (lexer))
1116         lex_source_destroy (src);
1117     }
1118 }
1119 \f
1120 static size_t
1121 lex_source_max_tail__ (const struct lex_source *src)
1122 {
1123   const struct lex_token *token;
1124   size_t max_tail;
1125
1126   assert (src->seg_pos >= src->line_pos);
1127   max_tail = MIN (src->journal_pos, src->line_pos);
1128
1129   /* Use the oldest token also.  (We know that src->deque cannot be empty
1130      because we are in the process of adding a new token, which is already
1131      initialized enough to use here.) */
1132   token = &src->tokens[deque_back (&src->deque, 0)];
1133   assert (token->token_pos >= token->line_pos);
1134   max_tail = MIN (max_tail, token->line_pos);
1135
1136   return max_tail;
1137 }
1138
1139 static void
1140 lex_source_expand__ (struct lex_source *src)
1141 {
1142   if (src->head - src->tail >= src->allocated)
1143     {
1144       size_t max_tail = lex_source_max_tail__ (src);
1145       if (max_tail > src->tail)
1146         {
1147           /* Advance the tail, freeing up room at the head. */
1148           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1149                    src->head - max_tail);
1150           src->tail = max_tail;
1151         }
1152       else
1153         {
1154           /* Buffer is completely full.  Expand it. */
1155           src->buffer = x2realloc (src->buffer, &src->allocated);
1156         }
1157     }
1158   else
1159     {
1160       /* There's space available at the head of the buffer.  Nothing to do. */
1161     }
1162 }
1163
1164 static void
1165 lex_source_read__ (struct lex_source *src)
1166 {
1167   do
1168     {
1169       size_t head_ofs;
1170       size_t n;
1171
1172       lex_source_expand__ (src);
1173
1174       head_ofs = src->head - src->tail;
1175       n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1176                                     src->allocated - head_ofs,
1177                                     segmenter_get_prompt (&src->segmenter));
1178       if (n == 0)
1179         {
1180           /* End of input.
1181
1182              Ensure that the input always ends in a new-line followed by a null
1183              byte, as required by the segmenter library. */
1184
1185           if (src->head == src->tail
1186               || src->buffer[src->head - src->tail - 1] != '\n')
1187             src->buffer[src->head++ - src->tail] = '\n';
1188
1189           lex_source_expand__ (src);
1190           src->buffer[src->head++ - src->tail] = '\0';
1191
1192           return;
1193         }
1194
1195       src->head += n;
1196     }
1197   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1198                   src->head - src->seg_pos));
1199 }
1200
1201 static struct lex_source *
1202 lex_source__ (const struct lexer *lexer)
1203 {
1204   return (ll_is_empty (&lexer->sources) ? NULL
1205           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1206 }
1207
1208 static struct substring
1209 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1210 {
1211   const struct lex_token *token0 = lex_source_next__ (src, n0);
1212   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1213   size_t start = token0->token_pos;
1214   size_t end = token1->token_pos + token1->token_len;
1215
1216   return ss_buffer (&src->buffer[start - src->tail], end - start);
1217 }
1218
1219 static void
1220 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1221 {
1222   size_t out_maxlen;
1223   size_t out_len;
1224   int mblen;
1225
1226   assert (out_size >= 16);
1227   out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1228   for (out_len = 0; out_len < in.length; out_len += mblen)
1229     {
1230       if (in.string[out_len] == '\n'
1231           || (in.string[out_len] == '\r'
1232               && out_len + 1 < in.length
1233               && in.string[out_len + 1] == '\n'))
1234         break;
1235
1236       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1237                         in.length - out_len);
1238       if (out_len + mblen > out_maxlen)
1239         break;
1240     }
1241
1242   memcpy (out, in.string, out_len);
1243   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1244 }
1245
1246 static void
1247 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1248                          const char *format, va_list args)
1249 {
1250   const struct lex_token *token;
1251   struct string s;
1252   struct msg m;
1253
1254   ds_init_empty (&s);
1255
1256   token = lex_source_next__ (src, n0);
1257   if (token->token.type == T_ENDCMD)
1258     ds_put_cstr (&s, _("Syntax error at end of command"));
1259   else
1260     {
1261       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1262       if (!ss_is_empty (syntax))
1263         {
1264           char syntax_cstr[64];
1265
1266           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1267           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1268         }
1269       else
1270         ds_put_cstr (&s, _("Syntax error"));
1271     }
1272
1273   if (format)
1274     {
1275       ds_put_cstr (&s, ": ");
1276       ds_put_vformat (&s, format, args);
1277     }
1278   ds_put_byte (&s, '.');
1279
1280   m.category = MSG_C_SYNTAX;
1281   m.severity = MSG_S_ERROR;
1282   m.file_name = src->reader->file_name;
1283   m.first_line = lex_source_get_first_line_number (src, n0);
1284   m.last_line = lex_source_get_last_line_number (src, n1);
1285   m.first_column = lex_source_get_first_column (src, n0);
1286   m.last_column = lex_source_get_last_column (src, n1);
1287   m.text = ds_steal_cstr (&s);
1288   msg_emit (&m);
1289 }
1290
1291 static void PRINTF_FORMAT (2, 3)
1292 lex_get_error (struct lex_source *src, const char *format, ...)
1293 {
1294   va_list args;
1295   int n;
1296
1297   va_start (args, format);
1298
1299   n = deque_count (&src->deque) - 1;
1300   lex_source_error_valist (src, n, n, format, args);
1301   lex_source_pop_front (src);
1302
1303   va_end (args);
1304 }
1305
1306 static bool
1307 lex_source_get__ (const struct lex_source *src_)
1308 {
1309   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1310
1311   struct state
1312     {
1313       struct segmenter segmenter;
1314       enum segment_type last_segment;
1315       int newlines;
1316       size_t line_pos;
1317       size_t seg_pos;
1318     };
1319
1320   struct state state, saved;
1321   enum scan_result result;
1322   struct scanner scanner;
1323   struct lex_token *token;
1324   int n_lines;
1325   int i;
1326
1327   if (src->eof)
1328     return false;
1329
1330   state.segmenter = src->segmenter;
1331   state.newlines = 0;
1332   state.seg_pos = src->seg_pos;
1333   state.line_pos = src->line_pos;
1334   saved = state;
1335
1336   token = lex_push_token__ (src);
1337   scanner_init (&scanner, &token->token);
1338   token->line_pos = src->line_pos;
1339   token->token_pos = src->seg_pos;
1340   if (src->reader->line_number > 0)
1341     token->first_line = src->reader->line_number + src->n_newlines;
1342   else
1343     token->first_line = 0;
1344
1345   for (;;)
1346     {
1347       enum segment_type type;
1348       const char *segment;
1349       size_t seg_maxlen;
1350       int seg_len;
1351
1352       segment = &src->buffer[state.seg_pos - src->tail];
1353       seg_maxlen = src->head - state.seg_pos;
1354       seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1355       if (seg_len < 0)
1356         {
1357           lex_source_read__ (src);
1358           continue;
1359         }
1360
1361       state.last_segment = type;
1362       state.seg_pos += seg_len;
1363       if (type == SEG_NEWLINE)
1364         {
1365           state.newlines++;
1366           state.line_pos = state.seg_pos;
1367         }
1368
1369       result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1370                              &token->token);
1371       if (result == SCAN_SAVE)
1372         saved = state;
1373       else if (result == SCAN_BACK)
1374         {
1375           state = saved;
1376           break;
1377         }
1378       else if (result == SCAN_DONE)
1379         break;
1380     }
1381
1382   n_lines = state.newlines;
1383   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1384     {
1385       n_lines++;
1386       src->suppress_next_newline = true;
1387     }
1388   else if (n_lines > 0 && src->suppress_next_newline)
1389     {
1390       n_lines--;
1391       src->suppress_next_newline = false;
1392     }
1393   for (i = 0; i < n_lines; i++)
1394     {
1395       const char *newline;
1396       const char *line;
1397       size_t line_len;
1398       char *syntax;
1399
1400       line = &src->buffer[src->journal_pos - src->tail];
1401       newline = rawmemchr (line, '\n');
1402       line_len = newline - line;
1403       if (line_len > 0 && line[line_len - 1] == '\r')
1404         line_len--;
1405
1406       syntax = malloc (line_len + 2);
1407       memcpy (syntax, line, line_len);
1408       syntax[line_len] = '\n';
1409       syntax[line_len + 1] = '\0';
1410
1411       text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1412
1413       src->journal_pos += newline - line + 1;
1414     }
1415
1416   token->token_len = state.seg_pos - src->seg_pos;
1417
1418   src->segmenter = state.segmenter;
1419   src->seg_pos = state.seg_pos;
1420   src->line_pos = state.line_pos;
1421   src->n_newlines += state.newlines;
1422
1423   switch (token->token.type)
1424     {
1425     default:
1426       break;
1427
1428     case T_STOP:
1429       token->token.type = T_ENDCMD;
1430       src->eof = true;
1431       break;
1432
1433     case SCAN_BAD_HEX_LENGTH:
1434       lex_get_error (src, _("String of hex digits has %d characters, which "
1435                             "is not a multiple of 2"),
1436                      (int) token->token.number);
1437       break;
1438
1439     case SCAN_BAD_HEX_DIGIT:
1440     case SCAN_BAD_UNICODE_DIGIT:
1441       lex_get_error (src, _("`%c' is not a valid hex digit"),
1442                      (int) token->token.number);
1443       break;
1444
1445     case SCAN_BAD_UNICODE_LENGTH:
1446       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1447                             "not in the valid range of 1 to 8 bytes"),
1448                      (int) token->token.number);
1449       break;
1450
1451     case SCAN_BAD_UNICODE_CODE_POINT:
1452       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1453                      (int) token->token.number);
1454       break;
1455
1456     case SCAN_EXPECTED_QUOTE:
1457       lex_get_error (src, _("Unterminated string constant"));
1458       break;
1459
1460     case SCAN_EXPECTED_EXPONENT:
1461       lex_get_error (src, _("Missing exponent following `%s'"),
1462                      token->token.string.string);
1463       break;
1464
1465     case SCAN_UNEXPECTED_DOT:
1466       lex_get_error (src, _("Unexpected `.' in middle of command"));
1467       break;
1468
1469     case SCAN_UNEXPECTED_CHAR:
1470       {
1471         char c_name[16];
1472         lex_get_error (src, _("Bad character %s in input"),
1473                        uc_name (token->token.number, c_name));
1474       }
1475       break;
1476
1477     case SCAN_SKIP:
1478       lex_source_pop_front (src);
1479       break;
1480     }
1481
1482   return true;
1483 }
1484 \f
1485 static void
1486 lex_source_push_endcmd__ (struct lex_source *src)
1487 {
1488   struct lex_token *token = lex_push_token__ (src);
1489   token->token.type = T_ENDCMD;
1490   token->token_pos = 0;
1491   token->token_len = 0;
1492   token->line_pos = 0;
1493   token->first_line = 0;
1494 }
1495
1496 static struct lex_source *
1497 lex_source_create (struct lex_reader *reader)
1498 {
1499   struct lex_source *src;
1500   enum segmenter_mode mode;
1501
1502   src = xzalloc (sizeof *src);
1503   src->reader = reader;
1504
1505   if (reader->syntax == LEX_SYNTAX_AUTO)
1506     mode = SEG_MODE_AUTO;
1507   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1508     mode = SEG_MODE_INTERACTIVE;
1509   else if (reader->syntax == LEX_SYNTAX_BATCH)
1510     mode = SEG_MODE_BATCH;
1511   else
1512     NOT_REACHED ();
1513   segmenter_init (&src->segmenter, mode);
1514
1515   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1516
1517   lex_source_push_endcmd__ (src);
1518
1519   return src;
1520 }
1521
1522 static void
1523 lex_source_destroy (struct lex_source *src)
1524 {
1525   char *file_name = src->reader->file_name;
1526   if (src->reader->class->destroy != NULL)
1527     src->reader->class->destroy (src->reader);
1528   free (file_name);
1529   free (src->buffer);
1530   while (!deque_is_empty (&src->deque))
1531     lex_source_pop__ (src);
1532   free (src->tokens);
1533   ll_remove (&src->ll);
1534   free (src);
1535 }
1536 \f
/* A lex_reader that obtains syntax from a file or from stdin. */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Base reader; lex_file_reader_cast() maps a
                                   pointer to it back to this struct. */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
    char *file_name;            /* Own copy of the file name, used in error
                                   messages and freed by lex_file_close(). */
  };

/* Class for lex_file_reader; its members are filled in further below. */
static struct lex_reader_class lex_file_reader_class;
1545
1546 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1547    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1548    ENCODING, which should take one of the forms accepted by
1549    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1550    mode of the new reader, respectively.
1551
1552    Returns a null pointer if FILE_NAME cannot be opened. */
1553 struct lex_reader *
1554 lex_reader_for_file (const char *file_name, const char *encoding,
1555                      enum lex_syntax_mode syntax,
1556                      enum lex_error_mode error)
1557 {
1558   struct lex_file_reader *r;
1559   struct u8_istream *istream;
1560
1561   istream = (!strcmp(file_name, "-")
1562              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1563              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1564   if (istream == NULL)
1565     {
1566       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1567       return NULL;
1568     }
1569
1570   r = xmalloc (sizeof *r);
1571   lex_reader_init (&r->reader, &lex_file_reader_class);
1572   r->reader.syntax = syntax;
1573   r->reader.error = error;
1574   r->reader.file_name = xstrdup (file_name);
1575   r->reader.line_number = 1;
1576   r->istream = istream;
1577   r->file_name = xstrdup (file_name);
1578
1579   return &r->reader;
1580 }
1581
1582 static struct lex_file_reader *
1583 lex_file_reader_cast (struct lex_reader *r)
1584 {
1585   return UP_CAST (r, struct lex_file_reader, reader);
1586 }
1587
1588 static size_t
1589 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1590                enum prompt_style prompt_style UNUSED)
1591 {
1592   struct lex_file_reader *r = lex_file_reader_cast (r_);
1593   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1594   if (n_read < 0)
1595     {
1596       msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
1597       return 0;
1598     }
1599   return n_read;
1600 }
1601
1602 static void
1603 lex_file_close (struct lex_reader *r_)
1604 {
1605   struct lex_file_reader *r = lex_file_reader_cast (r_);
1606
1607   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1608     {
1609       if (u8_istream_close (r->istream) != 0)
1610         msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
1611     }
1612   else
1613     u8_istream_free (r->istream);
1614
1615   free (r->file_name);
1616   free (r);
1617 }
1618
1619 static struct lex_reader_class lex_file_reader_class =
1620   {
1621     lex_file_read,
1622     lex_file_close
1623   };
1624 \f
/* A lex_reader that obtains syntax from a string in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Base reader; lex_string_reader_cast() maps
                                   a pointer to it back to this struct. */
    struct substring s;         /* The string being read; released with
                                   ss_dealloc() by lex_string_close(). */
    size_t offset;              /* Number of bytes of S already handed out by
                                   lex_string_read(). */
  };

/* Class for lex_string_reader; its members are filled in further below. */
static struct lex_reader_class lex_string_reader_class;
1633
1634 /* Creates and returns a new lex_reader for the contents of S, which must be
1635    encoded in UTF-8.  The new reader takes ownership of S and will free it
1636    with ss_dealloc() when it is closed. */
1637 struct lex_reader *
1638 lex_reader_for_substring_nocopy (struct substring s)
1639 {
1640   struct lex_string_reader *r;
1641
1642   r = xmalloc (sizeof *r);
1643   lex_reader_init (&r->reader, &lex_string_reader_class);
1644   r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1645   r->s = s;
1646   r->offset = 0;
1647
1648   return &r->reader;
1649 }
1650
1651 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1652    which must be encoded in UTF-8.  The caller retains ownership of S. */
1653 struct lex_reader *
1654 lex_reader_for_string (const char *s)
1655 {
1656   struct substring ss;
1657   ss_alloc_substring (&ss, ss_cstr (s));
1658   return lex_reader_for_substring_nocopy (ss);
1659 }
1660
/* Formats FORMAT and the arguments that follow it in the manner of printf(),
   then creates and returns a new lex_reader that supplies the formatted text.
   The reader owns that text. */
struct lex_reader *
lex_reader_for_format (const char *format, ...)
{
  va_list args;
  char *text;

  va_start (args, format);
  text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text));
}
1675
1676 static struct lex_string_reader *
1677 lex_string_reader_cast (struct lex_reader *r)
1678 {
1679   return UP_CAST (r, struct lex_string_reader, reader);
1680 }
1681
1682 static size_t
1683 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1684                  enum prompt_style prompt_style UNUSED)
1685 {
1686   struct lex_string_reader *r = lex_string_reader_cast (r_);
1687   size_t chunk;
1688
1689   chunk = MIN (n, r->s.length - r->offset);
1690   memcpy (buf, r->s.string + r->offset, chunk);
1691   r->offset += chunk;
1692
1693   return chunk;
1694 }
1695
1696 static void
1697 lex_string_close (struct lex_reader *r_)
1698 {
1699   struct lex_string_reader *r = lex_string_reader_cast (r_);
1700
1701   ss_dealloc (&r->s);
1702   free (r);
1703 }
1704
1705 static struct lex_reader_class lex_string_reader_class =
1706   {
1707     lex_string_read,
1708     lex_string_close
1709   };