pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "data/file-name.h"
  34 #include "language/command.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/text-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* Location of token in terms of the lex_source's buffer.
  66        src->tail <= line_pos <= token_pos <= src->head. */
  67     size_t token_pos;           /* Start of token. */
  68     size_t token_len;           /* Length of source for token in bytes. */
  69     size_t line_pos;            /* Start of line containing token_pos. */
  70     int first_line;             /* Line number at token_pos. */
  71   };
  72
  73 /* A source of tokens, corresponding to a syntax file.
  74
  75    This is conceptually a lex_reader wrapped with everything needed to convert
  76    its UTF-8 bytes into tokens. */
  77 struct lex_source
  78   {
  79     struct ll ll;               /* In lexer's list of sources. */
  80     struct lex_reader *reader;
  81     struct segmenter segmenter;
  82     bool eof;                   /* True if T_STOP was read from 'reader'. */
  83
  84     /* Buffer of UTF-8 bytes. */
  85     char *buffer;
  86     size_t allocated;           /* Number of bytes allocated. */
  87     size_t tail;                /* &buffer[0] offset into UTF-8 source. */
  88     size_t head;                /* &buffer[head - tail] offset into source. */
  89
  90     /* Positions in source file, tail <= pos <= head for each member here. */
  91     size_t journal_pos;         /* First byte not yet output to journal. */
  92     size_t seg_pos;             /* First byte not yet scanned as token. */
  93     size_t line_pos;            /* First byte of line containing seg_pos. */
  94
  95     int n_newlines;             /* Number of new-lines up to seg_pos. */
  96     bool suppress_next_newline;
  97
  98     /* Tokens. */
  99     struct deque deque;         /* Indexes into 'tokens'. */
 100     struct lex_token *tokens;   /* Lookahead tokens for parser. */
 101   };
 102
 103 static struct lex_source *lex_source_create (struct lex_reader *);
 104 static void lex_source_destroy (struct lex_source *);
 105
 106 /* Lexer. */
 107 struct lexer
 108   {
 109     struct ll_list sources;     /* Contains "struct lex_source"s. */
 110   };
 111
 112 static struct lex_source *lex_source__ (const struct lexer *);
 113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 114 static void lex_source_push_endcmd__ (struct lex_source *);
 115
 116 static void lex_source_pop__ (struct lex_source *);
 117 static bool lex_source_get__ (const struct lex_source *);
 118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 119                                      const char *format, va_list)
 120    PRINTF_FORMAT (4, 0);
 121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 122                                                   int n);
 123 \f
 124 /* Initializes READER with the specified CLASS and otherwise some reasonable
 125    defaults.  The caller should fill in the others members as desired. */
 126 void
 127 lex_reader_init (struct lex_reader *reader,
 128                  const struct lex_reader_class *class)
 129 {
 130   reader->class = class;
 131   reader->syntax = LEX_SYNTAX_AUTO;
 132   reader->error = LEX_ERROR_INTERACTIVE;
 133   reader->file_name = NULL;
 134   reader->line_number = 0;
 135 }
 136
 137 /* Frees any file name already in READER and replaces it by a copy of
 138    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 139 void
 140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 141 {
 142   free (reader->file_name);
 143   reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
 144 }
 145 \f
 146 /* Creates and returns a new lexer. */
 147 struct lexer *
 148 lex_create (void)
 149 {
 150   struct lexer *lexer = xzalloc (sizeof *lexer);
 151   ll_init (&lexer->sources);
 152   return lexer;
 153 }
 154
 155 /* Destroys LEXER. */
 156 void
 157 lex_destroy (struct lexer *lexer)
 158 {
 159   if (lexer != NULL)
 160     {
 161       struct lex_source *source, *next;
 162
 163       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 164         lex_source_destroy (source);
 165       free (lexer);
 166     }
 167 }
 168
 169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 170    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 171    token. */
 172 void
 173 lex_include (struct lexer *lexer, struct lex_reader *reader)
 174 {
 175   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 176   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 177 }
 178
 179 /* Appends READER to LEXER, so that it will be read after all other current
 180    readers have already been read. */
 181 void
 182 lex_append (struct lexer *lexer, struct lex_reader *reader)
 183 {
 184   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 185 }
 186 \f
/* Advancing. */
 188
 189 static struct lex_token *
 190 lex_push_token__ (struct lex_source *src)
 191 {
 192   struct lex_token *token;
 193
 194   if (deque_is_full (&src->deque))
 195     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 196
 197   token = &src->tokens[deque_push_front (&src->deque)];
 198   token_init (&token->token);
 199   return token;
 200 }
 201
 202 static void
 203 lex_source_pop__ (struct lex_source *src)
 204 {
 205   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 206 }
 207
 208 static void
 209 lex_source_pop_front (struct lex_source *src)
 210 {
 211   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 212 }
 213
 214 /* Advances LEXER to the next token, consuming the current token. */
 215 void
 216 lex_get (struct lexer *lexer)
 217 {
 218   struct lex_source *src;
 219
 220   src = lex_source__ (lexer);
 221   if (src == NULL)
 222     return;
 223
 224   if (!deque_is_empty (&src->deque))
 225     lex_source_pop__ (src);
 226
 227   while (deque_is_empty (&src->deque))
 228     if (!lex_source_get__ (src))
 229       {
 230         lex_source_destroy (src);
 231         src = lex_source__ (lexer);
 232         if (src == NULL)
 233           return;
 234       }
 235 }
 236 \f
 237 /* Issuing errors. */
 238
 239 /* Prints a syntax error message containing the current token and
 240    given message MESSAGE (if non-null). */
 241 void
 242 lex_error (struct lexer *lexer, const char *format, ...)
 243 {
 244   va_list args;
 245
 246   va_start (args, format);
 247   lex_next_error_valist (lexer, 0, 0, format, args);
 248   va_end (args);
 249 }
 250
 251 /* Prints a syntax error message containing the current token and
 252    given message MESSAGE (if non-null). */
 253 void
 254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 255 {
 256   lex_next_error_valist (lexer, 0, 0, format, args);
 257 }
 258
 259 /* Prints a syntax error message containing the current token and
 260    given message MESSAGE (if non-null). */
 261 void
 262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 263 {
 264   va_list args;
 265
 266   va_start (args, format);
 267   lex_next_error_valist (lexer, n0, n1, format, args);
 268   va_end (args);
 269 }
 270
 271 /* Prints a syntax error message saying that OPTION0 or one of the other
 272    strings following it, up to the first NULL, is expected. */
 273 void
 274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
 275 {
 276   enum { MAX_OPTIONS = 8 };
 277   const char *options[MAX_OPTIONS + 1];
 278   va_list args;
 279   int n;
 280
 281   va_start (args, option0);
 282   options[0] = option0;
 283   n = 0;
 284   while (n + 1 < MAX_OPTIONS && options[n] != NULL)
 285     options[++n] = va_arg (args, const char *);
 286   va_end (args);
 287
 288   switch (n)
 289     {
 290     case 0:
 291       lex_error (lexer, NULL);
 292       break;
 293
 294     case 1:
 295       lex_error (lexer, _("expecting %s"), options[0]);
 296       break;
 297
 298     case 2:
 299       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 300       break;
 301
 302     case 3:
 303       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 304                  options[2]);
 305       break;
 306
 307     case 4:
 308       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 309                  options[0], options[1], options[2], options[3]);
 310       break;
 311
 312     case 5:
 313       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 314                  options[0], options[1], options[2], options[3], options[4]);
 315       break;
 316
 317     case 6:
 318       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 319                  options[0], options[1], options[2], options[3], options[4],
 320                  options[5]);
 321       break;
 322
 323     case 7:
 324       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 325                  options[0], options[1], options[2], options[3], options[4],
 326                  options[5], options[6]);
 327       break;
 328
 329     case 8:
 330       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 331                  options[0], options[1], options[2], options[3], options[4],
 332                  options[5], options[6], options[7]);
 333       break;
 334
 335     default:
 336       NOT_REACHED ();
 337     }
 338 }
 339
 340 /* Reports an error to the effect that subcommand SBC may only be specified
 341    once.
 342
 343    This function does not take a lexer as an argument or use lex_error(),
 344    because the result would ordinarily just be redundant: "Syntax error at
 345    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 346    not help the user find the error. */
 347 void
 348 lex_sbc_only_once (const char *sbc)
 349 {
 350   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 351 }
 352
 353 /* Reports an error to the effect that subcommand SBC is missing.
 354
 355    This function does not take a lexer as an argument or use lex_error(),
 356    because a missing subcommand can normally be detected only after the whole
 357    command has been parsed, and so lex_error() would always report "Syntax
 358    error at end of command", which does not help the user find the error. */
 359 void
 360 lex_sbc_missing (const char *sbc)
 361 {
 362   msg (SE, _("Required subcommand %s was not specified."), sbc);
 363 }
 364
 365 /* Prints a syntax error message containing the current token and
 366    given message MESSAGE (if non-null). */
 367 void
 368 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 369                        const char *format, va_list args)
 370 {
 371   struct lex_source *src = lex_source__ (lexer);
 372
 373   if (src != NULL)
 374     lex_source_error_valist (src, n0, n1, format, args);
 375   else
 376     {
 377       struct string s;
 378
 379       ds_init_empty (&s);
 380       ds_put_format (&s, _("Syntax error at end of input"));
 381       if (format != NULL)
 382         {
 383           ds_put_cstr (&s, ": ");
 384           ds_put_vformat (&s, format, args);
 385         }
 386       ds_put_byte (&s, '.');
 387       msg (SE, "%s", ds_cstr (&s));
 388       ds_destroy (&s);
 389     }
 390 }
 391
 392 /* Checks that we're at end of command.
 393    If so, returns a successful command completion code.
 394    If not, flags a syntax error and returns an error command
 395    completion code. */
 396 int
 397 lex_end_of_command (struct lexer *lexer)
 398 {
 399   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 400     {
 401       lex_error (lexer, _("expecting end of command"));
 402       return CMD_FAILURE;
 403     }
 404   else
 405     return CMD_SUCCESS;
 406 }
 407 \f
 408 /* Token testing functions. */
 409
 410 /* Returns true if the current token is a number. */
 411 bool
 412 lex_is_number (struct lexer *lexer)
 413 {
 414   return lex_next_is_number (lexer, 0);
 415 }
 416
 417 /* Returns true if the current token is a string. */
 418 bool
 419 lex_is_string (struct lexer *lexer)
 420 {
 421   return lex_next_is_string (lexer, 0);
 422 }
 423
 424 /* Returns the value of the current token, which must be a
 425    floating point number. */
 426 double
 427 lex_number (struct lexer *lexer)
 428 {
 429   return lex_next_number (lexer, 0);
 430 }
 431
 432 /* Returns true iff the current token is an integer. */
 433 bool
 434 lex_is_integer (struct lexer *lexer)
 435 {
 436   return lex_next_is_integer (lexer, 0);
 437 }
 438
 439 /* Returns the value of the current token, which must be an
 440    integer. */
 441 long
 442 lex_integer (struct lexer *lexer)
 443 {
 444   return lex_next_integer (lexer, 0);
 445 }
 446 \f
 447 /* Token testing functions with lookahead.
 448
 449    A value of 0 for N as an argument to any of these functions refers to the
 450    current token.  Lookahead is limited to the current command.  Any N greater
 451    than the number of tokens remaining in the current command will be treated
 452    as referring to a T_ENDCMD token. */
 453
 454 /* Returns true if the token N ahead of the current token is a number. */
 455 bool
 456 lex_next_is_number (struct lexer *lexer, int n)
 457 {
 458   enum token_type next_token = lex_next_token (lexer, n);
 459   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 460 }
 461
 462 /* Returns true if the token N ahead of the current token is a string. */
 463 bool
 464 lex_next_is_string (struct lexer *lexer, int n)
 465 {
 466   return lex_next_token (lexer, n) == T_STRING;
 467 }
 468
 469 /* Returns the value of the token N ahead of the current token, which must be a
 470    floating point number. */
 471 double
 472 lex_next_number (struct lexer *lexer, int n)
 473 {
 474   assert (lex_next_is_number (lexer, n));
 475   return lex_next_tokval (lexer, n);
 476 }
 477
 478 /* Returns true if the token N ahead of the current token is an integer. */
 479 bool
 480 lex_next_is_integer (struct lexer *lexer, int n)
 481 {
 482   double value;
 483
 484   if (!lex_next_is_number (lexer, n))
 485     return false;
 486
 487   value = lex_next_tokval (lexer, n);
 488   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 489 }
 490
 491 /* Returns the value of the token N ahead of the current token, which must be
 492    an integer. */
 493 long
 494 lex_next_integer (struct lexer *lexer, int n)
 495 {
 496   assert (lex_next_is_integer (lexer, n));
 497   return lex_next_tokval (lexer, n);
 498 }
 499 \f
 500 /* Token matching functions. */
 501
 502 /* If the current token has the specified TYPE, skips it and returns true.
 503    Otherwise, returns false. */
 504 bool
 505 lex_match (struct lexer *lexer, enum token_type type)
 506 {
 507   if (lex_token (lexer) == type)
 508     {
 509       lex_get (lexer);
 510       return true;
 511     }
 512   else
 513     return false;
 514 }
 515
 516 /* If the current token matches IDENTIFIER, skips it and returns true.
 517    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 518    returns false.
 519
 520    IDENTIFIER must be an ASCII string. */
 521 bool
 522 lex_match_id (struct lexer *lexer, const char *identifier)
 523 {
 524   return lex_match_id_n (lexer, identifier, 3);
 525 }
 526
 527 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 528    may be abbreviated to its first N letters.  Otherwise, returns false.
 529
 530    IDENTIFIER must be an ASCII string. */
 531 bool
 532 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 533 {
 534   if (lex_token (lexer) == T_ID
 535       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 536     {
 537       lex_get (lexer);
 538       return true;
 539     }
 540   else
 541     return false;
 542 }
 543
 544 /* If the current token is integer X, skips it and returns true.  Otherwise,
 545    returns false. */
 546 bool
 547 lex_match_int (struct lexer *lexer, int x)
 548 {
 549   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 550     {
 551       lex_get (lexer);
 552       return true;
 553     }
 554   else
 555     return false;
 556 }
 557 \f
 558 /* Forced matches. */
 559
 560 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 561    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 562    false.
 563
 564    IDENTIFIER must be an ASCII string. */
 565 bool
 566 lex_force_match_id (struct lexer *lexer, const char *identifier)
 567 {
 568   if (lex_match_id (lexer, identifier))
 569     return true;
 570   else
 571     {
 572       lex_error_expecting (lexer, identifier, NULL_SENTINEL);
 573       return false;
 574     }
 575 }
 576
 577 /* If the current token has the specified TYPE, skips it and returns true.
 578    Otherwise, reports an error and returns false. */
 579 bool
 580 lex_force_match (struct lexer *lexer, enum token_type type)
 581 {
 582   if (lex_token (lexer) == type)
 583     {
 584       lex_get (lexer);
 585       return true;
 586     }
 587   else
 588     {
 589       char *s = xasprintf ("`%s'", token_type_to_string (type));
 590       lex_error_expecting (lexer, s, NULL_SENTINEL);
 591       free (s);
 592       return false;
 593     }
 594 }
 595
 596 /* If the current token is a string, does nothing and returns true.
 597    Otherwise, reports an error and returns false. */
 598 bool
 599 lex_force_string (struct lexer *lexer)
 600 {
 601   if (lex_is_string (lexer))
 602     return true;
 603   else
 604     {
 605       lex_error (lexer, _("expecting string"));
 606       return false;
 607     }
 608 }
 609
 610 /* If the current token is an integer, does nothing and returns true.
 611    Otherwise, reports an error and returns false. */
 612 bool
 613 lex_force_int (struct lexer *lexer)
 614 {
 615   if (lex_is_integer (lexer))
 616     return true;
 617   else
 618     {
 619       lex_error (lexer, _("expecting integer"));
 620       return false;
 621     }
 622 }
 623
 624 /* If the current token is a number, does nothing and returns true.
 625    Otherwise, reports an error and returns false. */
 626 bool
 627 lex_force_num (struct lexer *lexer)
 628 {
 629   if (lex_is_number (lexer))
 630     return true;
 631
 632   lex_error (lexer, _("expecting number"));
 633   return false;
 634 }
 635
 636 /* If the current token is an identifier, does nothing and returns true.
 637    Otherwise, reports an error and returns false. */
 638 bool
 639 lex_force_id (struct lexer *lexer)
 640 {
 641   if (lex_token (lexer) == T_ID)
 642     return true;
 643
 644   lex_error (lexer, _("expecting identifier"));
 645   return false;
 646 }
 647 \f
 648 /* Token accessors. */
 649
 650 /* Returns the type of LEXER's current token. */
 651 enum token_type
 652 lex_token (const struct lexer *lexer)
 653 {
 654   return lex_next_token (lexer, 0);
 655 }
 656
 657 /* Returns the number in LEXER's current token.
 658
 659    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 660    tokens this function will always return zero. */
 661 double
 662 lex_tokval (const struct lexer *lexer)
 663 {
 664   return lex_next_tokval (lexer, 0);
 665 }
 666
 667 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 668
 669    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 670    this functions this function will always return NULL.
 671
 672    The UTF-8 encoding of the returned string is correct for variable names and
 673    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 674    data_in() to use it in a "union value".  */
 675 const char *
 676 lex_tokcstr (const struct lexer *lexer)
 677 {
 678   return lex_next_tokcstr (lexer, 0);
 679 }
 680
 681 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 682    null-terminated (but the null terminator is not included in the returned
 683    substring's 'length').
 684
 685    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 686    this functions this function will always return NULL.
 687
 688    The UTF-8 encoding of the returned string is correct for variable names and
 689    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 690    data_in() to use it in a "union value".  */
 691 struct substring
 692 lex_tokss (const struct lexer *lexer)
 693 {
 694   return lex_next_tokss (lexer, 0);
 695 }
 696 \f
 697 /* Looking ahead.
 698
 699    A value of 0 for N as an argument to any of these functions refers to the
 700    current token.  Lookahead is limited to the current command.  Any N greater
 701    than the number of tokens remaining in the current command will be treated
 702    as referring to a T_ENDCMD token. */
 703
 704 static const struct lex_token *
 705 lex_next__ (const struct lexer *lexer_, int n)
 706 {
 707   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 708   struct lex_source *src = lex_source__ (lexer);
 709
 710   if (src != NULL)
 711     return lex_source_next__ (src, n);
 712   else
 713     {
 714       static const struct lex_token stop_token =
 715         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 716
 717       return &stop_token;
 718     }
 719 }
 720
 721 static const struct lex_token *
 722 lex_source_next__ (const struct lex_source *src, int n)
 723 {
 724   while (deque_count (&src->deque) <= n)
 725     {
 726       if (!deque_is_empty (&src->deque))
 727         {
 728           struct lex_token *front;
 729
 730           front = &src->tokens[deque_front (&src->deque, 0)];
 731           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 732             return front;
 733         }
 734
 735       lex_source_get__ (src);
 736     }
 737
 738   return &src->tokens[deque_back (&src->deque, n)];
 739 }
 740
 741 /* Returns the "struct token" of the token N after the current one in LEXER.
 742    The returned pointer can be invalidated by pretty much any succeeding call
 743    into the lexer, although the string pointer within the returned token is
 744    only invalidated by consuming the token (e.g. with lex_get()). */
 745 const struct token *
 746 lex_next (const struct lexer *lexer, int n)
 747 {
 748   return &lex_next__ (lexer, n)->token;
 749 }
 750
 751 /* Returns the type of the token N after the current one in LEXER. */
 752 enum token_type
 753 lex_next_token (const struct lexer *lexer, int n)
 754 {
 755   return lex_next (lexer, n)->type;
 756 }
 757
 758 /* Returns the number in the tokn N after the current one in LEXER.
 759
 760    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 761    tokens this function will always return zero. */
 762 double
 763 lex_next_tokval (const struct lexer *lexer, int n)
 764 {
 765   const struct token *token = lex_next (lexer, n);
 766   return token->number;
 767 }
 768
 769 /* Returns the null-terminated string in the token N after the current one, in
 770    UTF-8 encoding.
 771
 772    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 773    this functions this function will always return NULL.
 774
 775    The UTF-8 encoding of the returned string is correct for variable names and
 776    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 777    data_in() to use it in a "union value".  */
 778 const char *
 779 lex_next_tokcstr (const struct lexer *lexer, int n)
 780 {
 781   return lex_next_tokss (lexer, n).string;
 782 }
 783
 784 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 785    The string is null-terminated (but the null terminator is not included in
 786    the returned substring's 'length').
 787
 788    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 789    this functions this function will always return NULL.
 790
 791    The UTF-8 encoding of the returned string is correct for variable names and
 792    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 793    data_in() to use it in a "union value".  */
 794 struct substring
 795 lex_next_tokss (const struct lexer *lexer, int n)
 796 {
 797   return lex_next (lexer, n)->string;
 798 }
 799
 800 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
 801    true.  Otherwise, returns false.
 802
 803    S may consist of an arbitrary number of identifiers, integers, and
 804    punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
 805    Identifiers may be abbreviated to their first three letters.  Currently only
 806    hyphens, slashes, and equals signs are supported as punctuation (but it
 807    would be easy to add more).
 808
 809    S must be an ASCII string. */
 810 bool
 811 lex_match_phrase (struct lexer *lexer, const char *s)
 812 {
 813   int tok_idx;
 814
 815   for (tok_idx = 0; ; tok_idx++)
 816     {
 817       enum token_type token;
 818       unsigned char c;
 819
 820       while (c_isspace (*s))
 821         s++;
 822
 823       c = *s;
 824       if (c == '\0')
 825         {
 826           int i;
 827
 828           for (i = 0; i < tok_idx; i++)
 829             lex_get (lexer);
 830           return true;
 831         }
 832
 833       token = lex_next_token (lexer, tok_idx);
 834       switch (c)
 835         {
 836         case '-':
 837           if (token != T_DASH)
 838             return false;
 839           s++;
 840           break;
 841
 842         case '/':
 843           if (token != T_SLASH)
 844             return false;
 845           s++;
 846           break;
 847
 848         case '=':
 849           if (token != T_EQUALS)
 850             return false;
 851           s++;
 852           break;
 853
 854         case '0': case '1': case '2': case '3': case '4':
 855         case '5': case '6': case '7': case '8': case '9':
 856           {
 857             unsigned int value;
 858
 859             if (token != T_POS_NUM)
 860               return false;
 861
 862             value = 0;
 863             do
 864               {
 865                 value = value * 10 + (*s++ - '0');
 866               }
 867             while (c_isdigit (*s));
 868
 869             if (lex_next_tokval (lexer, tok_idx) != value)
 870               return false;
 871           }
 872           break;
 873
 874         default:
 875           if (lex_is_id1 (c))
 876             {
 877               int len;
 878
 879               if (token != T_ID)
 880                 return false;
 881
 882               len = lex_id_get_length (ss_cstr (s));
 883               if (!lex_id_match (ss_buffer (s, len),
 884                                  lex_next_tokss (lexer, tok_idx)))
 885                 return false;
 886
 887               s += len;
 888             }
 889           else
 890             NOT_REACHED ();
 891         }
 892     }
 893 }
 894
 895 static int
 896 lex_source_get_first_line_number (const struct lex_source *src, int n)
 897 {
 898   return lex_source_next__ (src, n)->first_line;
 899 }
 900
 901 static int
 902 count_newlines (char *s, size_t length)
 903 {
 904   int n_newlines = 0;
 905   char *newline;
 906
 907   while ((newline = memchr (s, '\n', length)) != NULL)
 908     {
 909       n_newlines++;
 910       length -= (newline + 1) - s;
 911       s = newline + 1;
 912     }
 913
 914   return n_newlines;
 915 }
 916
 917 static int
 918 lex_source_get_last_line_number (const struct lex_source *src, int n)
 919 {
 920   const struct lex_token *token = lex_source_next__ (src, n);
 921
 922   if (token->first_line == 0)
 923     return 0;
 924   else
 925     {
 926       char *token_str = &src->buffer[token->token_pos - src->tail];
 927       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 928     }
 929 }
 930
 931 static int
 932 count_columns (const char *s_, size_t length)
 933 {
 934   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 935   int columns;
 936   size_t ofs;
 937   int mblen;
 938
 939   columns = 0;
 940   for (ofs = 0; ofs < length; ofs += mblen)
 941     {
 942       ucs4_t uc;
 943
 944       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 945       if (uc != '\t')
 946         {
 947           int width = uc_width (uc, "UTF-8");
 948           if (width > 0)
 949             columns += width;
 950         }
 951       else
 952         columns = ROUND_UP (columns + 1, 8);
 953     }
 954
 955   return columns + 1;
 956 }
 957
 958 static int
 959 lex_source_get_first_column (const struct lex_source *src, int n)
 960 {
 961   const struct lex_token *token = lex_source_next__ (src, n);
 962   return count_columns (&src->buffer[token->line_pos - src->tail],
 963                         token->token_pos - token->line_pos);
 964 }
 965
 966 static int
 967 lex_source_get_last_column (const struct lex_source *src, int n)
 968 {
 969   const struct lex_token *token = lex_source_next__ (src, n);
 970   char *start, *end, *newline;
 971
 972   start = &src->buffer[token->line_pos - src->tail];
 973   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 974   newline = memrchr (start, '\n', end - start);
 975   if (newline != NULL)
 976     start = newline + 1;
 977   return count_columns (start, end - start);
 978 }
 979
 980 /* Returns the 1-based line number of the start of the syntax that represents
 981    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
 982    if the token is drawn from a source that does not have line numbers. */
 983 int
 984 lex_get_first_line_number (const struct lexer *lexer, int n)
 985 {
 986   const struct lex_source *src = lex_source__ (lexer);
 987   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
 988 }
 989
 990 /* Returns the 1-based line number of the end of the syntax that represents the
 991    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 992    token or if the token is drawn from a source that does not have line
 993    numbers.
 994
 995    Most of the time, a single token is wholly within a single line of syntax,
 996    but there are two exceptions: a T_STRING token can be made up of multiple
 997    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
 998    token can consist of a "-" on one line followed by the number on the next.
 999  */
1000 int
1001 lex_get_last_line_number (const struct lexer *lexer, int n)
1002 {
1003   const struct lex_source *src = lex_source__ (lexer);
1004   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1005 }
1006
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER begins.  The result is 0 for a T_STOP token; in
   particular it is 0 when LEXER has no source at all.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters occupy 2 columns and combining characters
   occupy none. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
1020
/* Returns one more than the 1-based column at which the syntax for the token N
   after the current one in LEXER ends.  The result is 0 for a T_STOP token; in
   particular it is 0 when LEXER has no source at all.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters occupy 2 columns and combining characters
   occupy none. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1034
1035 /* Returns the name of the syntax file from which the current command is drawn.
1036    Returns NULL for a T_STOP token or if the command's source does not have
1037    line numbers.
1038
1039    There is no version of this function that takes an N argument because
1040    lookahead only works to the end of a command and any given command is always
1041    within a single syntax file. */
1042 const char *
1043 lex_get_file_name (const struct lexer *lexer)
1044 {
1045   struct lex_source *src = lex_source__ (lexer);
1046   return src == NULL ? NULL : src->reader->file_name;
1047 }
1048
1049 /* Returns the syntax mode for the syntax file from which the current drawn is
1050    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1051    source does not have line numbers.
1052
1053    There is no version of this function that takes an N argument because
1054    lookahead only works to the end of a command and any given command is always
1055    within a single syntax file. */
1056 enum lex_syntax_mode
1057 lex_get_syntax_mode (const struct lexer *lexer)
1058 {
1059   struct lex_source *src = lex_source__ (lexer);
1060   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1061 }
1062
1063 /* Returns the error mode for the syntax file from which the current drawn is
1064    drawn.  Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
1065    source does not have line numbers.
1066
1067    There is no version of this function that takes an N argument because
1068    lookahead only works to the end of a command and any given command is always
1069    within a single syntax file. */
1070 enum lex_error_mode
1071 lex_get_error_mode (const struct lexer *lexer)
1072 {
1073   struct lex_source *src = lex_source__ (lexer);
1074   return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1075 }
1076
1077 /* If the source that LEXER is currently reading has error mode
1078    LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1079    next token to be read comes directly from whatever is next read from the
1080    stream.
1081
1082    It makes sense to call this function after encountering an error in a
1083    command entered on the console, because usually the user would prefer not to
1084    have cascading errors. */
1085 void
1086 lex_interactive_reset (struct lexer *lexer)
1087 {
1088   struct lex_source *src = lex_source__ (lexer);
1089   if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1090     {
1091       src->head = src->tail = 0;
1092       src->journal_pos = src->seg_pos = src->line_pos = 0;
1093       src->n_newlines = 0;
1094       src->suppress_next_newline = false;
1095       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1096       while (!deque_is_empty (&src->deque))
1097         lex_source_pop__ (src);
1098       lex_source_push_endcmd__ (src);
1099     }
1100 }
1101
1102 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1103 void
1104 lex_discard_rest_of_command (struct lexer *lexer)
1105 {
1106   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1107     lex_get (lexer);
1108 }
1109
1110 /* Discards all lookahead tokens in LEXER, then discards all input sources
1111    until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1112    runs out of input sources. */
1113 void
1114 lex_discard_noninteractive (struct lexer *lexer)
1115 {
1116   struct lex_source *src = lex_source__ (lexer);
1117
1118   if (src != NULL)
1119     {
1120       while (!deque_is_empty (&src->deque))
1121         lex_source_pop__ (src);
1122
1123       for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1124            src = lex_source__ (lexer))
1125         lex_source_destroy (src);
1126     }
1127 }
1128 \f
1129 static size_t
1130 lex_source_max_tail__ (const struct lex_source *src)
1131 {
1132   const struct lex_token *token;
1133   size_t max_tail;
1134
1135   assert (src->seg_pos >= src->line_pos);
1136   max_tail = MIN (src->journal_pos, src->line_pos);
1137
1138   /* Use the oldest token also.  (We know that src->deque cannot be empty
1139      because we are in the process of adding a new token, which is already
1140      initialized enough to use here.) */
1141   token = &src->tokens[deque_back (&src->deque, 0)];
1142   assert (token->token_pos >= token->line_pos);
1143   max_tail = MIN (max_tail, token->line_pos);
1144
1145   return max_tail;
1146 }
1147
1148 static void
1149 lex_source_expand__ (struct lex_source *src)
1150 {
1151   if (src->head - src->tail >= src->allocated)
1152     {
1153       size_t max_tail = lex_source_max_tail__ (src);
1154       if (max_tail > src->tail)
1155         {
1156           /* Advance the tail, freeing up room at the head. */
1157           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1158                    src->head - max_tail);
1159           src->tail = max_tail;
1160         }
1161       else
1162         {
1163           /* Buffer is completely full.  Expand it. */
1164           src->buffer = x2realloc (src->buffer, &src->allocated);
1165         }
1166     }
1167   else
1168     {
1169       /* There's space available at the head of the buffer.  Nothing to do. */
1170     }
1171 }
1172
1173 static void
1174 lex_source_read__ (struct lex_source *src)
1175 {
1176   do
1177     {
1178       size_t head_ofs;
1179       size_t n;
1180
1181       lex_source_expand__ (src);
1182
1183       head_ofs = src->head - src->tail;
1184       n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1185                                     src->allocated - head_ofs,
1186                                     segmenter_get_prompt (&src->segmenter));
1187       if (n == 0)
1188         {
1189           /* End of input.
1190
1191              Ensure that the input always ends in a new-line followed by a null
1192              byte, as required by the segmenter library. */
1193
1194           if (src->head == src->tail
1195               || src->buffer[src->head - src->tail - 1] != '\n')
1196             src->buffer[src->head++ - src->tail] = '\n';
1197
1198           lex_source_expand__ (src);
1199           src->buffer[src->head++ - src->tail] = '\0';
1200
1201           return;
1202         }
1203
1204       src->head += n;
1205     }
1206   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1207                   src->head - src->seg_pos));
1208 }
1209
1210 static struct lex_source *
1211 lex_source__ (const struct lexer *lexer)
1212 {
1213   return (ll_is_empty (&lexer->sources) ? NULL
1214           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1215 }
1216
1217 static struct substring
1218 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1219 {
1220   const struct lex_token *token0 = lex_source_next__ (src, n0);
1221   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1222   size_t start = token0->token_pos;
1223   size_t end = token1->token_pos + token1->token_len;
1224
1225   return ss_buffer (&src->buffer[start - src->tail], end - start);
1226 }
1227
1228 static void
1229 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1230 {
1231   size_t out_maxlen;
1232   size_t out_len;
1233   int mblen;
1234
1235   assert (out_size >= 16);
1236   out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1237   for (out_len = 0; out_len < in.length; out_len += mblen)
1238     {
1239       if (in.string[out_len] == '\n'
1240           || (in.string[out_len] == '\r'
1241               && out_len + 1 < in.length
1242               && in.string[out_len + 1] == '\n'))
1243         break;
1244
1245       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1246                         in.length - out_len);
1247       if (out_len + mblen > out_maxlen)
1248         break;
1249     }
1250
1251   memcpy (out, in.string, out_len);
1252   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1253 }
1254
1255 static void
1256 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1257                          const char *format, va_list args)
1258 {
1259   const struct lex_token *token;
1260   struct string s;
1261   struct msg m;
1262
1263   ds_init_empty (&s);
1264
1265   token = lex_source_next__ (src, n0);
1266   if (token->token.type == T_ENDCMD)
1267     ds_put_cstr (&s, _("Syntax error at end of command"));
1268   else
1269     {
1270       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1271       if (!ss_is_empty (syntax))
1272         {
1273           char syntax_cstr[64];
1274
1275           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1276           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1277         }
1278       else
1279         ds_put_cstr (&s, _("Syntax error"));
1280     }
1281
1282   if (format)
1283     {
1284       ds_put_cstr (&s, ": ");
1285       ds_put_vformat (&s, format, args);
1286     }
1287   ds_put_byte (&s, '.');
1288
1289   m.category = MSG_C_SYNTAX;
1290   m.severity = MSG_S_ERROR;
1291   m.file_name = src->reader->file_name;
1292   m.first_line = lex_source_get_first_line_number (src, n0);
1293   m.last_line = lex_source_get_last_line_number (src, n1);
1294   m.first_column = lex_source_get_first_column (src, n0);
1295   m.last_column = lex_source_get_last_column (src, n1);
1296   m.text = ds_steal_cstr (&s);
1297   msg_emit (&m);
1298 }
1299
1300 static void PRINTF_FORMAT (2, 3)
1301 lex_get_error (struct lex_source *src, const char *format, ...)
1302 {
1303   va_list args;
1304   int n;
1305
1306   va_start (args, format);
1307
1308   n = deque_count (&src->deque) - 1;
1309   lex_source_error_valist (src, n, n, format, args);
1310   lex_source_pop_front (src);
1311
1312   va_end (args);
1313 }
1314
1315 static bool
1316 lex_source_get__ (const struct lex_source *src_)
1317 {
1318   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1319
1320   struct state
1321     {
1322       struct segmenter segmenter;
1323       enum segment_type last_segment;
1324       int newlines;
1325       size_t line_pos;
1326       size_t seg_pos;
1327     };
1328
1329   struct state state, saved;
1330   enum scan_result result;
1331   struct scanner scanner;
1332   struct lex_token *token;
1333   int n_lines;
1334   int i;
1335
1336   if (src->eof)
1337     return false;
1338
1339   state.segmenter = src->segmenter;
1340   state.newlines = 0;
1341   state.seg_pos = src->seg_pos;
1342   state.line_pos = src->line_pos;
1343   saved = state;
1344
1345   token = lex_push_token__ (src);
1346   scanner_init (&scanner, &token->token);
1347   token->line_pos = src->line_pos;
1348   token->token_pos = src->seg_pos;
1349   if (src->reader->line_number > 0)
1350     token->first_line = src->reader->line_number + src->n_newlines;
1351   else
1352     token->first_line = 0;
1353
1354   for (;;)
1355     {
1356       enum segment_type type;
1357       const char *segment;
1358       size_t seg_maxlen;
1359       int seg_len;
1360
1361       segment = &src->buffer[state.seg_pos - src->tail];
1362       seg_maxlen = src->head - state.seg_pos;
1363       seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1364       if (seg_len < 0)
1365         {
1366           lex_source_read__ (src);
1367           continue;
1368         }
1369
1370       state.last_segment = type;
1371       state.seg_pos += seg_len;
1372       if (type == SEG_NEWLINE)
1373         {
1374           state.newlines++;
1375           state.line_pos = state.seg_pos;
1376         }
1377
1378       result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1379                              &token->token);
1380       if (result == SCAN_SAVE)
1381         saved = state;
1382       else if (result == SCAN_BACK)
1383         {
1384           state = saved;
1385           break;
1386         }
1387       else if (result == SCAN_DONE)
1388         break;
1389     }
1390
1391   n_lines = state.newlines;
1392   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1393     {
1394       n_lines++;
1395       src->suppress_next_newline = true;
1396     }
1397   else if (n_lines > 0 && src->suppress_next_newline)
1398     {
1399       n_lines--;
1400       src->suppress_next_newline = false;
1401     }
1402   for (i = 0; i < n_lines; i++)
1403     {
1404       const char *newline;
1405       const char *line;
1406       size_t line_len;
1407       char *syntax;
1408
1409       line = &src->buffer[src->journal_pos - src->tail];
1410       newline = rawmemchr (line, '\n');
1411       line_len = newline - line;
1412       if (line_len > 0 && line[line_len - 1] == '\r')
1413         line_len--;
1414
1415       syntax = malloc (line_len + 2);
1416       memcpy (syntax, line, line_len);
1417       syntax[line_len] = '\n';
1418       syntax[line_len + 1] = '\0';
1419
1420       text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1421
1422       src->journal_pos += newline - line + 1;
1423     }
1424
1425   token->token_len = state.seg_pos - src->seg_pos;
1426
1427   src->segmenter = state.segmenter;
1428   src->seg_pos = state.seg_pos;
1429   src->line_pos = state.line_pos;
1430   src->n_newlines += state.newlines;
1431
1432   switch (token->token.type)
1433     {
1434     default:
1435       break;
1436
1437     case T_STOP:
1438       token->token.type = T_ENDCMD;
1439       src->eof = true;
1440       break;
1441
1442     case SCAN_BAD_HEX_LENGTH:
1443       lex_get_error (src, _("String of hex digits has %d characters, which "
1444                             "is not a multiple of 2"),
1445                      (int) token->token.number);
1446       break;
1447
1448     case SCAN_BAD_HEX_DIGIT:
1449     case SCAN_BAD_UNICODE_DIGIT:
1450       lex_get_error (src, _("`%c' is not a valid hex digit"),
1451                      (int) token->token.number);
1452       break;
1453
1454     case SCAN_BAD_UNICODE_LENGTH:
1455       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1456                             "not in the valid range of 1 to 8 bytes"),
1457                      (int) token->token.number);
1458       break;
1459
1460     case SCAN_BAD_UNICODE_CODE_POINT:
1461       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1462                      (int) token->token.number);
1463       break;
1464
1465     case SCAN_EXPECTED_QUOTE:
1466       lex_get_error (src, _("Unterminated string constant"));
1467       break;
1468
1469     case SCAN_EXPECTED_EXPONENT:
1470       lex_get_error (src, _("Missing exponent following `%s'"),
1471                      token->token.string.string);
1472       break;
1473
1474     case SCAN_UNEXPECTED_DOT:
1475       lex_get_error (src, _("Unexpected `.' in middle of command"));
1476       break;
1477
1478     case SCAN_UNEXPECTED_CHAR:
1479       {
1480         char c_name[16];
1481         lex_get_error (src, _("Bad character %s in input"),
1482                        uc_name (token->token.number, c_name));
1483       }
1484       break;
1485
1486     case SCAN_SKIP:
1487       lex_source_pop_front (src);
1488       break;
1489     }
1490
1491   return true;
1492 }
1493 \f
1494 static void
1495 lex_source_push_endcmd__ (struct lex_source *src)
1496 {
1497   struct lex_token *token = lex_push_token__ (src);
1498   token->token.type = T_ENDCMD;
1499   token->token_pos = 0;
1500   token->token_len = 0;
1501   token->line_pos = 0;
1502   token->first_line = 0;
1503 }
1504
1505 static struct lex_source *
1506 lex_source_create (struct lex_reader *reader)
1507 {
1508   struct lex_source *src;
1509   enum segmenter_mode mode;
1510
1511   src = xzalloc (sizeof *src);
1512   src->reader = reader;
1513
1514   if (reader->syntax == LEX_SYNTAX_AUTO)
1515     mode = SEG_MODE_AUTO;
1516   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1517     mode = SEG_MODE_INTERACTIVE;
1518   else if (reader->syntax == LEX_SYNTAX_BATCH)
1519     mode = SEG_MODE_BATCH;
1520   else
1521     NOT_REACHED ();
1522   segmenter_init (&src->segmenter, mode);
1523
1524   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1525
1526   lex_source_push_endcmd__ (src);
1527
1528   return src;
1529 }
1530
1531 static void
1532 lex_source_destroy (struct lex_source *src)
1533 {
1534   char *file_name = src->reader->file_name;
1535   if (src->reader->class->destroy != NULL)
1536     src->reader->class->destroy (src->reader);
1537   free (file_name);
1538   free (src->buffer);
1539   while (!deque_is_empty (&src->deque))
1540     lex_source_pop__ (src);
1541   free (src->tokens);
1542   ll_remove (&src->ll);
1543   free (src);
1544 }
1545 \f
/* A lex_reader that draws its input from a file or from stdin. */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Embedded base reader; lex_file_reader_cast()
                                   maps a pointer to it back to this struct. */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
    char *file_name;            /* Own copy of the file name, used in error
                                   messages and freed by lex_file_close(). */
  };

/* Forward declaration; the initializer appears after the callbacks that it
   refers to. */
static struct lex_reader_class lex_file_reader_class;
1554
1555 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1556    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1557    ENCODING, which should take one of the forms accepted by
1558    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1559    mode of the new reader, respectively.
1560
1561    Returns a null pointer if FILE_NAME cannot be opened. */
1562 struct lex_reader *
1563 lex_reader_for_file (const char *file_name, const char *encoding,
1564                      enum lex_syntax_mode syntax,
1565                      enum lex_error_mode error)
1566 {
1567   struct lex_file_reader *r;
1568   struct u8_istream *istream;
1569
1570   istream = (!strcmp(file_name, "-")
1571              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1572              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1573   if (istream == NULL)
1574     {
1575       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1576       return NULL;
1577     }
1578
1579   r = xmalloc (sizeof *r);
1580   lex_reader_init (&r->reader, &lex_file_reader_class);
1581   r->reader.syntax = syntax;
1582   r->reader.error = error;
1583   r->reader.file_name = xstrdup (file_name);
1584   r->reader.line_number = 1;
1585   r->istream = istream;
1586   r->file_name = xstrdup (file_name);
1587
1588   return &r->reader;
1589 }
1590
1591 static struct lex_file_reader *
1592 lex_file_reader_cast (struct lex_reader *r)
1593 {
1594   return UP_CAST (r, struct lex_file_reader, reader);
1595 }
1596
1597 static size_t
1598 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1599                enum prompt_style prompt_style UNUSED)
1600 {
1601   struct lex_file_reader *r = lex_file_reader_cast (r_);
1602   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1603   if (n_read < 0)
1604     {
1605       msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
1606       return 0;
1607     }
1608   return n_read;
1609 }
1610
1611 static void
1612 lex_file_close (struct lex_reader *r_)
1613 {
1614   struct lex_file_reader *r = lex_file_reader_cast (r_);
1615
1616   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1617     {
1618       if (u8_istream_close (r->istream) != 0)
1619         msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
1620     }
1621   else
1622     u8_istream_free (r->istream);
1623
1624   free (r->file_name);
1625   free (r);
1626 }
1627
/* Callbacks for file readers, given positionally.  (Presumably these fill the
   class's `read' and `destroy' members, in that order; confirm against the
   declaration of struct lex_reader_class.) */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,              /* Fetches more input from the file. */
    lex_file_close              /* Releases the stream and the reader. */
  };
1633 \f
/* A lex_reader that draws its input from a string held in memory. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded base reader;
                                   lex_string_reader_cast() maps a pointer to
                                   it back to this struct. */
    struct substring s;         /* The text to read; released with
                                   ss_dealloc() by lex_string_close(). */
    size_t offset;              /* Number of bytes of S that
                                   lex_string_read() has already returned. */
  };

/* Forward declaration; the initializer appears after the callbacks that it
   refers to. */
static struct lex_reader_class lex_string_reader_class;
1642
1643 /* Creates and returns a new lex_reader for the contents of S, which must be
1644    encoded in UTF-8.  The new reader takes ownership of S and will free it
1645    with ss_dealloc() when it is closed. */
1646 struct lex_reader *
1647 lex_reader_for_substring_nocopy (struct substring s)
1648 {
1649   struct lex_string_reader *r;
1650
1651   r = xmalloc (sizeof *r);
1652   lex_reader_init (&r->reader, &lex_string_reader_class);
1653   r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1654   r->s = s;
1655   r->offset = 0;
1656
1657   return &r->reader;
1658 }
1659
1660 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1661    which must be encoded in UTF-8.  The caller retains ownership of S. */
1662 struct lex_reader *
1663 lex_reader_for_string (const char *s)
1664 {
1665   struct substring ss;
1666   ss_alloc_substring (&ss, ss_cstr (s));
1667   return lex_reader_for_substring_nocopy (ss);
1668 }
1669
/* Expands FORMAT, a printf()-style format string, with the arguments that
   follow it, and creates and returns a new lex_reader that reads the
   resulting text.  The reader owns the formatted text. */
struct lex_reader *
lex_reader_for_format (const char *format, ...)
{
  char *formatted;
  va_list args;

  va_start (args, format);
  formatted = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (formatted));
}
1684
1685 static struct lex_string_reader *
1686 lex_string_reader_cast (struct lex_reader *r)
1687 {
1688   return UP_CAST (r, struct lex_string_reader, reader);
1689 }
1690
1691 static size_t
1692 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1693                  enum prompt_style prompt_style UNUSED)
1694 {
1695   struct lex_string_reader *r = lex_string_reader_cast (r_);
1696   size_t chunk;
1697
1698   chunk = MIN (n, r->s.length - r->offset);
1699   memcpy (buf, r->s.string + r->offset, chunk);
1700   r->offset += chunk;
1701
1702   return chunk;
1703 }
1704
1705 static void
1706 lex_string_close (struct lex_reader *r_)
1707 {
1708   struct lex_string_reader *r = lex_string_reader_cast (r_);
1709
1710   ss_dealloc (&r->s);
1711   free (r);
1712 }
1713
/* Callbacks for string readers, given positionally.  (Presumably these fill
   the class's `read' and `destroy' members, in that order; confirm against
   the declaration of struct lex_reader_class.) */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,            /* Hands out the next part of the string. */
    lex_string_close            /* Releases the string and the reader. */
  };