src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "data/file-name.h"
  34 #include "language/command.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/text-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Besides the token proper, this records where in the source's UTF-8 text
   the token came from, so that line and column numbers can be computed
   later from the bytes still held in the lex_source's buffer. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head.

       These are offsets into the source as a whole; code that needs a
       pointer into the buffer subtracts src->tail from them. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */
  };
  72
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens: a segmenter, a buffer of bytes read but not yet
   discarded, and a deque of tokens already scanned for the parser. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes.

       'tail' and 'head' are offsets into the source as a whole, so the byte
       at source offset POS is stored at buffer[POS - tail]. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       New tokens are pushed at the front of the deque and the parser's
       current token is the one at the back. */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 102
 103 static struct lex_source *lex_source_create (struct lex_reader *);
 104 static void lex_source_destroy (struct lex_source *);
 105
/* Lexer.

   A lexer is a list of sources.  Tokens are drawn from the source at the head
   of the list: lex_include() pushes a new source at the head and lex_append()
   adds one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
  };
 111
 112 static struct lex_source *lex_source__ (const struct lexer *);
 113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 114 static void lex_source_push_endcmd__ (struct lex_source *);
 115
 116 static void lex_source_pop__ (struct lex_source *);
 117 static bool lex_source_get__ (const struct lex_source *);
 118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 119                                      const char *format, va_list)
 120    PRINTF_FORMAT (4, 0);
 121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 122                                                   int n);
 123 \f
 124 /* Initializes READER with the specified CLASS and otherwise some reasonable
 125    defaults.  The caller should fill in the others members as desired. */
 126 void
 127 lex_reader_init (struct lex_reader *reader,
 128                  const struct lex_reader_class *class)
 129 {
 130   reader->class = class;
 131   reader->syntax = LEX_SYNTAX_AUTO;
 132   reader->error = LEX_ERROR_INTERACTIVE;
 133   reader->file_name = NULL;
 134   reader->line_number = 0;
 135 }
 136
 137 /* Frees any file name already in READER and replaces it by a copy of
 138    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 139 void
 140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 141 {
 142   free (reader->file_name);
 143   reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
 144 }
 145 \f
 146 /* Creates and returns a new lexer. */
 147 struct lexer *
 148 lex_create (void)
 149 {
 150   struct lexer *lexer = xzalloc (sizeof *lexer);
 151   ll_init (&lexer->sources);
 152   return lexer;
 153 }
 154
 155 /* Destroys LEXER. */
 156 void
 157 lex_destroy (struct lexer *lexer)
 158 {
 159   if (lexer != NULL)
 160     {
 161       struct lex_source *source, *next;
 162
 163       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 164         lex_source_destroy (source);
 165       free (lexer);
 166     }
 167 }
 168
 169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 170    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 171    token. */
 172 void
 173 lex_include (struct lexer *lexer, struct lex_reader *reader)
 174 {
 175   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 176   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 177 }
 178
 179 /* Appends READER to LEXER, so that it will be read after all other current
 180    readers have already been read. */
 181 void
 182 lex_append (struct lexer *lexer, struct lex_reader *reader)
 183 {
 184   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 185 }
 186 \f
/* Advancing. */
 188
 189 static struct lex_token *
 190 lex_push_token__ (struct lex_source *src)
 191 {
 192   struct lex_token *token;
 193
 194   if (deque_is_full (&src->deque))
 195     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 196
 197   token = &src->tokens[deque_push_front (&src->deque)];
 198   token_init (&token->token);
 199   return token;
 200 }
 201
 202 static void
 203 lex_source_pop__ (struct lex_source *src)
 204 {
 205   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 206 }
 207
 208 static void
 209 lex_source_pop_front (struct lex_source *src)
 210 {
 211   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 212 }
 213
 214 /* Advances LEXER to the next token, consuming the current token. */
 215 void
 216 lex_get (struct lexer *lexer)
 217 {
 218   struct lex_source *src;
 219
 220   src = lex_source__ (lexer);
 221   if (src == NULL)
 222     return;
 223
 224   if (!deque_is_empty (&src->deque))
 225     lex_source_pop__ (src);
 226
 227   while (deque_is_empty (&src->deque))
 228     if (!lex_source_get__ (src))
 229       {
 230         lex_source_destroy (src);
 231         src = lex_source__ (lexer);
 232         if (src == NULL)
 233           return;
 234       }
 235 }
 236 \f
 237 /* Issuing errors. */
 238
 239 /* Prints a syntax error message containing the current token and
 240    given message MESSAGE (if non-null). */
 241 void
 242 lex_error (struct lexer *lexer, const char *format, ...)
 243 {
 244   va_list args;
 245
 246   va_start (args, format);
 247   lex_next_error_valist (lexer, 0, 0, format, args);
 248   va_end (args);
 249 }
 250
 251 /* Prints a syntax error message containing the current token and
 252    given message MESSAGE (if non-null). */
 253 void
 254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 255 {
 256   lex_next_error_valist (lexer, 0, 0, format, args);
 257 }
 258
 259 /* Prints a syntax error message containing the current token and
 260    given message MESSAGE (if non-null). */
 261 void
 262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 263 {
 264   va_list args;
 265
 266   va_start (args, format);
 267   lex_next_error_valist (lexer, n0, n1, format, args);
 268   va_end (args);
 269 }
 270
/* Reports an error to the effect that subcommand SBC may only be
   specified once.

   SBC is the name of the subcommand, which is inserted into the message
   as-is.  The error is issued directly through msg() with class SE, not
   through lex_error(), so this function needs no lexer. */
void
lex_sbc_only_once (const char *sbc)
{
  msg (SE, _("Subcommand %s may only be specified once."), sbc);
}
 278
/* Reports an error to the effect that subcommand SBC is
   missing.

   SBC is the name of the required subcommand.  The error is reported through
   lex_error(), that is, as a syntax error at LEXER's current token. */
void
lex_sbc_missing (struct lexer *lexer, const char *sbc)
{
  lex_error (lexer, _("missing required subcommand %s"), sbc);
}
 286
 287 /* Prints a syntax error message containing the current token and
 288    given message MESSAGE (if non-null). */
 289 void
 290 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 291                        const char *format, va_list args)
 292 {
 293   struct lex_source *src = lex_source__ (lexer);
 294
 295   if (src != NULL)
 296     lex_source_error_valist (src, n0, n1, format, args);
 297   else
 298     {
 299       struct string s;
 300
 301       ds_init_empty (&s);
 302       ds_put_format (&s, _("Syntax error at end of input"));
 303       if (format != NULL)
 304         {
 305           ds_put_cstr (&s, ": ");
 306           ds_put_vformat (&s, format, args);
 307         }
 308       ds_put_byte (&s, '.');
 309       msg (SE, "%s", ds_cstr (&s));
 310       ds_destroy (&s);
 311     }
 312 }
 313
 314 /* Checks that we're at end of command.
 315    If so, returns a successful command completion code.
 316    If not, flags a syntax error and returns an error command
 317    completion code. */
 318 int
 319 lex_end_of_command (struct lexer *lexer)
 320 {
 321   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 322     {
 323       lex_error (lexer, _("expecting end of command"));
 324       return CMD_FAILURE;
 325     }
 326   else
 327     return CMD_SUCCESS;
 328 }
 329 \f
 330 /* Token testing functions. */
 331
 332 /* Returns true if the current token is a number. */
 333 bool
 334 lex_is_number (struct lexer *lexer)
 335 {
 336   return lex_next_is_number (lexer, 0);
 337 }
 338
 339 /* Returns true if the current token is a string. */
 340 bool
 341 lex_is_string (struct lexer *lexer)
 342 {
 343   return lex_next_is_string (lexer, 0);
 344 }
 345
 346 /* Returns the value of the current token, which must be a
 347    floating point number. */
 348 double
 349 lex_number (struct lexer *lexer)
 350 {
 351   return lex_next_number (lexer, 0);
 352 }
 353
 354 /* Returns true iff the current token is an integer. */
 355 bool
 356 lex_is_integer (struct lexer *lexer)
 357 {
 358   return lex_next_is_integer (lexer, 0);
 359 }
 360
 361 /* Returns the value of the current token, which must be an
 362    integer. */
 363 long
 364 lex_integer (struct lexer *lexer)
 365 {
 366   return lex_next_integer (lexer, 0);
 367 }
 368 \f
 369 /* Token testing functions with lookahead.
 370
 371    A value of 0 for N as an argument to any of these functions refers to the
 372    current token.  Lookahead is limited to the current command.  Any N greater
 373    than the number of tokens remaining in the current command will be treated
 374    as referring to a T_ENDCMD token. */
 375
 376 /* Returns true if the token N ahead of the current token is a number. */
 377 bool
 378 lex_next_is_number (struct lexer *lexer, int n)
 379 {
 380   enum token_type next_token = lex_next_token (lexer, n);
 381   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 382 }
 383
 384 /* Returns true if the token N ahead of the current token is a string. */
 385 bool
 386 lex_next_is_string (struct lexer *lexer, int n)
 387 {
 388   return lex_next_token (lexer, n) == T_STRING;
 389 }
 390
 391 /* Returns the value of the token N ahead of the current token, which must be a
 392    floating point number. */
 393 double
 394 lex_next_number (struct lexer *lexer, int n)
 395 {
 396   assert (lex_next_is_number (lexer, n));
 397   return lex_next_tokval (lexer, n);
 398 }
 399
 400 /* Returns true if the token N ahead of the current token is an integer. */
 401 bool
 402 lex_next_is_integer (struct lexer *lexer, int n)
 403 {
 404   double value;
 405
 406   if (!lex_next_is_number (lexer, n))
 407     return false;
 408
 409   value = lex_next_tokval (lexer, n);
 410   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 411 }
 412
 413 /* Returns the value of the token N ahead of the current token, which must be
 414    an integer. */
 415 long
 416 lex_next_integer (struct lexer *lexer, int n)
 417 {
 418   assert (lex_next_is_integer (lexer, n));
 419   return lex_next_tokval (lexer, n);
 420 }
 421 \f
 422 /* Token matching functions. */
 423
 424 /* If the current token has the specified TYPE, skips it and returns true.
 425    Otherwise, returns false. */
 426 bool
 427 lex_match (struct lexer *lexer, enum token_type type)
 428 {
 429   if (lex_token (lexer) == type)
 430     {
 431       lex_get (lexer);
 432       return true;
 433     }
 434   else
 435     return false;
 436 }
 437
 438 /* If the current token matches IDENTIFIER, skips it and returns true.
 439    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 440    returns false.
 441
 442    IDENTIFIER must be an ASCII string. */
 443 bool
 444 lex_match_id (struct lexer *lexer, const char *identifier)
 445 {
 446   return lex_match_id_n (lexer, identifier, 3);
 447 }
 448
 449 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 450    may be abbreviated to its first N letters.  Otherwise, returns false.
 451
 452    IDENTIFIER must be an ASCII string. */
 453 bool
 454 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 455 {
 456   if (lex_token (lexer) == T_ID
 457       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 458     {
 459       lex_get (lexer);
 460       return true;
 461     }
 462   else
 463     return false;
 464 }
 465
 466 /* If the current token is integer X, skips it and returns true.  Otherwise,
 467    returns false. */
 468 bool
 469 lex_match_int (struct lexer *lexer, int x)
 470 {
 471   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 472     {
 473       lex_get (lexer);
 474       return true;
 475     }
 476   else
 477     return false;
 478 }
 479 \f
 480 /* Forced matches. */
 481
 482 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 483    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 484    false.
 485
 486    IDENTIFIER must be an ASCII string. */
 487 bool
 488 lex_force_match_id (struct lexer *lexer, const char *identifier)
 489 {
 490   if (lex_match_id (lexer, identifier))
 491     return true;
 492   else
 493     {
 494       lex_error (lexer, _("expecting `%s'"), identifier);
 495       return false;
 496     }
 497 }
 498
 499 /* If the current token has the specified TYPE, skips it and returns true.
 500    Otherwise, reports an error and returns false. */
 501 bool
 502 lex_force_match (struct lexer *lexer, enum token_type type)
 503 {
 504   if (lex_token (lexer) == type)
 505     {
 506       lex_get (lexer);
 507       return true;
 508     }
 509   else
 510     {
 511       lex_error (lexer, _("expecting `%s'"), token_type_to_string (type));
 512       return false;
 513     }
 514 }
 515
 516 /* If the current token is a string, does nothing and returns true.
 517    Otherwise, reports an error and returns false. */
 518 bool
 519 lex_force_string (struct lexer *lexer)
 520 {
 521   if (lex_is_string (lexer))
 522     return true;
 523   else
 524     {
 525       lex_error (lexer, _("expecting string"));
 526       return false;
 527     }
 528 }
 529
 530 /* If the current token is an integer, does nothing and returns true.
 531    Otherwise, reports an error and returns false. */
 532 bool
 533 lex_force_int (struct lexer *lexer)
 534 {
 535   if (lex_is_integer (lexer))
 536     return true;
 537   else
 538     {
 539       lex_error (lexer, _("expecting integer"));
 540       return false;
 541     }
 542 }
 543
 544 /* If the current token is a number, does nothing and returns true.
 545    Otherwise, reports an error and returns false. */
 546 bool
 547 lex_force_num (struct lexer *lexer)
 548 {
 549   if (lex_is_number (lexer))
 550     return true;
 551
 552   lex_error (lexer, _("expecting number"));
 553   return false;
 554 }
 555
 556 /* If the current token is an identifier, does nothing and returns true.
 557    Otherwise, reports an error and returns false. */
 558 bool
 559 lex_force_id (struct lexer *lexer)
 560 {
 561   if (lex_token (lexer) == T_ID)
 562     return true;
 563
 564   lex_error (lexer, _("expecting identifier"));
 565   return false;
 566 }
 567 \f
 568 /* Token accessors. */
 569
 570 /* Returns the type of LEXER's current token. */
 571 enum token_type
 572 lex_token (const struct lexer *lexer)
 573 {
 574   return lex_next_token (lexer, 0);
 575 }
 576
 577 /* Returns the number in LEXER's current token.
 578
 579    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 580    tokens this function will always return zero. */
 581 double
 582 lex_tokval (const struct lexer *lexer)
 583 {
 584   return lex_next_tokval (lexer, 0);
 585 }
 586
 587 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 588
 589    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 590    this functions this function will always return NULL.
 591
 592    The UTF-8 encoding of the returned string is correct for variable names and
 593    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 594    data_in() to use it in a "union value".  */
 595 const char *
 596 lex_tokcstr (const struct lexer *lexer)
 597 {
 598   return lex_next_tokcstr (lexer, 0);
 599 }
 600
 601 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 602    null-terminated (but the null terminator is not included in the returned
 603    substring's 'length').
 604
 605    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 606    this functions this function will always return NULL.
 607
 608    The UTF-8 encoding of the returned string is correct for variable names and
 609    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 610    data_in() to use it in a "union value".  */
 611 struct substring
 612 lex_tokss (const struct lexer *lexer)
 613 {
 614   return lex_next_tokss (lexer, 0);
 615 }
 616 \f
 617 /* Looking ahead.
 618
 619    A value of 0 for N as an argument to any of these functions refers to the
 620    current token.  Lookahead is limited to the current command.  Any N greater
 621    than the number of tokens remaining in the current command will be treated
 622    as referring to a T_ENDCMD token. */
 623
 624 static const struct lex_token *
 625 lex_next__ (const struct lexer *lexer_, int n)
 626 {
 627   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 628   struct lex_source *src = lex_source__ (lexer);
 629
 630   if (src != NULL)
 631     return lex_source_next__ (src, n);
 632   else
 633     {
 634       static const struct lex_token stop_token =
 635         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 636
 637       return &stop_token;
 638     }
 639 }
 640
 641 static const struct lex_token *
 642 lex_source_next__ (const struct lex_source *src, int n)
 643 {
 644   while (deque_count (&src->deque) <= n)
 645     {
 646       if (!deque_is_empty (&src->deque))
 647         {
 648           struct lex_token *front;
 649
 650           front = &src->tokens[deque_front (&src->deque, 0)];
 651           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 652             return front;
 653         }
 654
 655       lex_source_get__ (src);
 656     }
 657
 658   return &src->tokens[deque_back (&src->deque, n)];
 659 }
 660
 661 /* Returns the "struct token" of the token N after the current one in LEXER.
 662    The returned pointer can be invalidated by pretty much any succeeding call
 663    into the lexer, although the string pointer within the returned token is
 664    only invalidated by consuming the token (e.g. with lex_get()). */
 665 const struct token *
 666 lex_next (const struct lexer *lexer, int n)
 667 {
 668   return &lex_next__ (lexer, n)->token;
 669 }
 670
 671 /* Returns the type of the token N after the current one in LEXER. */
 672 enum token_type
 673 lex_next_token (const struct lexer *lexer, int n)
 674 {
 675   return lex_next (lexer, n)->type;
 676 }
 677
 678 /* Returns the number in the tokn N after the current one in LEXER.
 679
 680    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 681    tokens this function will always return zero. */
 682 double
 683 lex_next_tokval (const struct lexer *lexer, int n)
 684 {
 685   const struct token *token = lex_next (lexer, n);
 686   return token->number;
 687 }
 688
 689 /* Returns the null-terminated string in the token N after the current one, in
 690    UTF-8 encoding.
 691
 692    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 693    this functions this function will always return NULL.
 694
 695    The UTF-8 encoding of the returned string is correct for variable names and
 696    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 697    data_in() to use it in a "union value".  */
 698 const char *
 699 lex_next_tokcstr (const struct lexer *lexer, int n)
 700 {
 701   return lex_next_tokss (lexer, n).string;
 702 }
 703
 704 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 705    The string is null-terminated (but the null terminator is not included in
 706    the returned substring's 'length').
 707
 708    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 709    this functions this function will always return NULL.
 710
 711    The UTF-8 encoding of the returned string is correct for variable names and
 712    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 713    data_in() to use it in a "union value".  */
 714 struct substring
 715 lex_next_tokss (const struct lexer *lexer, int n)
 716 {
 717   return lex_next (lexer, n)->string;
 718 }
 719
 720 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
 721    true.  Otherwise, returns false.
 722
 723    S may consist of an arbitrary number of identifiers, integers, and
 724    punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
 725    Identifiers may be abbreviated to their first three letters.  Currently only
 726    hyphens, slashes, and equals signs are supported as punctuation (but it
 727    would be easy to add more).
 728
 729    S must be an ASCII string. */
 730 bool
 731 lex_match_phrase (struct lexer *lexer, const char *s)
 732 {
 733   int tok_idx;
 734
 735   for (tok_idx = 0; ; tok_idx++)
 736     {
 737       enum token_type token;
 738       unsigned char c;
 739
 740       while (c_isspace (*s))
 741         s++;
 742
 743       c = *s;
 744       if (c == '\0')
 745         {
 746           int i;
 747
 748           for (i = 0; i < tok_idx; i++)
 749             lex_get (lexer);
 750           return true;
 751         }
 752
 753       token = lex_next_token (lexer, tok_idx);
 754       switch (c)
 755         {
 756         case '-':
 757           if (token != T_DASH)
 758             return false;
 759           s++;
 760           break;
 761
 762         case '/':
 763           if (token != T_SLASH)
 764             return false;
 765           s++;
 766           break;
 767
 768         case '=':
 769           if (token != T_EQUALS)
 770             return false;
 771           s++;
 772           break;
 773
 774         case '0': case '1': case '2': case '3': case '4':
 775         case '5': case '6': case '7': case '8': case '9':
 776           {
 777             unsigned int value;
 778
 779             if (token != T_POS_NUM)
 780               return false;
 781
 782             value = 0;
 783             do
 784               {
 785                 value = value * 10 + (*s++ - '0');
 786               }
 787             while (c_isdigit (*s));
 788
 789             if (lex_next_tokval (lexer, tok_idx) != value)
 790               return false;
 791           }
 792           break;
 793
 794         default:
 795           if (lex_is_id1 (c))
 796             {
 797               int len;
 798
 799               if (token != T_ID)
 800                 return false;
 801
 802               len = lex_id_get_length (ss_cstr (s));
 803               if (!lex_id_match (ss_buffer (s, len),
 804                                  lex_next_tokss (lexer, tok_idx)))
 805                 return false;
 806
 807               s += len;
 808             }
 809           else
 810             NOT_REACHED ();
 811         }
 812     }
 813 }
 814
 815 static int
 816 lex_source_get_first_line_number (const struct lex_source *src, int n)
 817 {
 818   return lex_source_next__ (src, n)->first_line;
 819 }
 820
 821 static int
 822 count_newlines (char *s, size_t length)
 823 {
 824   int n_newlines = 0;
 825   char *newline;
 826
 827   while ((newline = memchr (s, '\n', length)) != NULL)
 828     {
 829       n_newlines++;
 830       length -= (newline + 1) - s;
 831       s = newline + 1;
 832     }
 833
 834   return n_newlines;
 835 }
 836
 837 static int
 838 lex_source_get_last_line_number (const struct lex_source *src, int n)
 839 {
 840   const struct lex_token *token = lex_source_next__ (src, n);
 841
 842   if (token->first_line == 0)
 843     return 0;
 844   else
 845     {
 846       char *token_str = &src->buffer[token->token_pos - src->tail];
 847       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 848     }
 849 }
 850
 851 static int
 852 count_columns (const char *s_, size_t length)
 853 {
 854   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 855   int columns;
 856   size_t ofs;
 857   int mblen;
 858
 859   columns = 0;
 860   for (ofs = 0; ofs < length; ofs += mblen)
 861     {
 862       ucs4_t uc;
 863
 864       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 865       if (uc != '\t')
 866         {
 867           int width = uc_width (uc, "UTF-8");
 868           if (width > 0)
 869             columns += width;
 870         }
 871       else
 872         columns = ROUND_UP (columns + 1, 8);
 873     }
 874
 875   return columns + 1;
 876 }
 877
 878 static int
 879 lex_source_get_first_column (const struct lex_source *src, int n)
 880 {
 881   const struct lex_token *token = lex_source_next__ (src, n);
 882   return count_columns (&src->buffer[token->line_pos - src->tail],
 883                         token->token_pos - token->line_pos);
 884 }
 885
 886 static int
 887 lex_source_get_last_column (const struct lex_source *src, int n)
 888 {
 889   const struct lex_token *token = lex_source_next__ (src, n);
 890   char *start, *end, *newline;
 891
 892   start = &src->buffer[token->line_pos - src->tail];
 893   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 894   newline = memrchr (start, '\n', end - start);
 895   if (newline != NULL)
 896     start = newline + 1;
 897   return count_columns (start, end - start);
 898 }
 899
 900 /* Returns the 1-based line number of the start of the syntax that represents
 901    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
 902    if the token is drawn from a source that does not have line numbers. */
 903 int
 904 lex_get_first_line_number (const struct lexer *lexer, int n)
 905 {
 906   const struct lex_source *src = lex_source__ (lexer);
 907   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
 908 }
 909
 910 /* Returns the 1-based line number of the end of the syntax that represents the
 911    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 912    token or if the token is drawn from a source that does not have line
 913    numbers.
 914
 915    Most of the time, a single token is wholly within a single line of syntax,
 916    but there are two exceptions: a T_STRING token can be made up of multiple
 917    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
 918    token can consist of a "-" on one line followed by the number on the next.
 919  */
 920 int
 921 lex_get_last_line_number (const struct lexer *lexer, int n)
 922 {
 923   const struct lex_source *src = lex_source__ (lexer);
 924   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
 925 }
 926
 927 /* Returns the 1-based column number of the start of the syntax that represents
 928    the token N after the current one in LEXER.  Returns 0 for a T_STOP
 929    token.
 930
 931    Column numbers are measured according to the width of characters as shown in
 932    a typical fixed-width font, in which CJK characters have width 2 and
 933    combining characters have width 0.  */
 934 int
 935 lex_get_first_column (const struct lexer *lexer, int n)
 936 {
 937   const struct lex_source *src = lex_source__ (lexer);
 938   return src != NULL ? lex_source_get_first_column (src, n) : 0;
 939 }
 940
 941 /* Returns the 1-based column number of the end of the syntax that represents
 942    the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 943    token.
 944
 945    Column numbers are measured according to the width of characters as shown in
 946    a typical fixed-width font, in which CJK characters have width 2 and
 947    combining characters have width 0.  */
 948 int
 949 lex_get_last_column (const struct lexer *lexer, int n)
 950 {
 951   const struct lex_source *src = lex_source__ (lexer);
 952   return src != NULL ? lex_source_get_last_column (src, n) : 0;
 953 }
 954
 955 /* Returns the name of the syntax file from which the current command is drawn.
 956    Returns NULL for a T_STOP token or if the command's source does not have
 957    line numbers.
 958
 959    There is no version of this function that takes an N argument because
 960    lookahead only works to the end of a command and any given command is always
 961    within a single syntax file. */
 962 const char *
 963 lex_get_file_name (const struct lexer *lexer)
 964 {
 965   struct lex_source *src = lex_source__ (lexer);
 966   return src == NULL ? NULL : src->reader->file_name;
 967 }
 968
 969 /* Returns the syntax mode for the syntax file from which the current drawn is
 970    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
 971    source does not have line numbers.
 972
 973    There is no version of this function that takes an N argument because
 974    lookahead only works to the end of a command and any given command is always
 975    within a single syntax file. */
 976 enum lex_syntax_mode
 977 lex_get_syntax_mode (const struct lexer *lexer)
 978 {
 979   struct lex_source *src = lex_source__ (lexer);
 980   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
 981 }
 982
 983 /* Returns the error mode for the syntax file from which the current drawn is
 984    drawn.  Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
 985    source does not have line numbers.
 986
 987    There is no version of this function that takes an N argument because
 988    lookahead only works to the end of a command and any given command is always
 989    within a single syntax file. */
 990 enum lex_error_mode
 991 lex_get_error_mode (const struct lexer *lexer)
 992 {
 993   struct lex_source *src = lex_source__ (lexer);
 994   return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
 995 }
 996
 997 /* If the source that LEXER is currently reading has error mode
 998    LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
 999    next token to be read comes directly from whatever is next read from the
1000    stream.
1001
1002    It makes sense to call this function after encountering an error in a
1003    command entered on the console, because usually the user would prefer not to
1004    have cascading errors. */
1005 void
1006 lex_interactive_reset (struct lexer *lexer)
1007 {
1008   struct lex_source *src = lex_source__ (lexer);
1009   if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1010     {
1011       src->head = src->tail = 0;
1012       src->journal_pos = src->seg_pos = src->line_pos = 0;
1013       src->n_newlines = 0;
1014       src->suppress_next_newline = false;
1015       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1016       while (!deque_is_empty (&src->deque))
1017         lex_source_pop__ (src);
1018       lex_source_push_endcmd__ (src);
1019     }
1020 }
1021
1022 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1023 void
1024 lex_discard_rest_of_command (struct lexer *lexer)
1025 {
1026   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1027     lex_get (lexer);
1028 }
1029
1030 /* Discards all lookahead tokens in LEXER, then discards all input sources
1031    until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1032    runs out of input sources. */
1033 void
1034 lex_discard_noninteractive (struct lexer *lexer)
1035 {
1036   struct lex_source *src = lex_source__ (lexer);
1037
1038   if (src != NULL)
1039     {
1040       while (!deque_is_empty (&src->deque))
1041         lex_source_pop__ (src);
1042
1043       for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1044            src = lex_source__ (lexer))
1045         lex_source_destroy (src);
1046     }
1047 }
1048 \f
1049 static size_t
1050 lex_source_max_tail__ (const struct lex_source *src)
1051 {
1052   const struct lex_token *token;
1053   size_t max_tail;
1054
1055   assert (src->seg_pos >= src->line_pos);
1056   max_tail = MIN (src->journal_pos, src->line_pos);
1057
1058   /* Use the oldest token also.  (We know that src->deque cannot be empty
1059      because we are in the process of adding a new token, which is already
1060      initialized enough to use here.) */
1061   token = &src->tokens[deque_back (&src->deque, 0)];
1062   assert (token->token_pos >= token->line_pos);
1063   max_tail = MIN (max_tail, token->line_pos);
1064
1065   return max_tail;
1066 }
1067
1068 static void
1069 lex_source_expand__ (struct lex_source *src)
1070 {
1071   if (src->head - src->tail >= src->allocated)
1072     {
1073       size_t max_tail = lex_source_max_tail__ (src);
1074       if (max_tail > src->tail)
1075         {
1076           /* Advance the tail, freeing up room at the head. */
1077           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1078                    src->head - max_tail);
1079           src->tail = max_tail;
1080         }
1081       else
1082         {
1083           /* Buffer is completely full.  Expand it. */
1084           src->buffer = x2realloc (src->buffer, &src->allocated);
1085         }
1086     }
1087   else
1088     {
1089       /* There's space available at the head of the buffer.  Nothing to do. */
1090     }
1091 }
1092
1093 static void
1094 lex_source_read__ (struct lex_source *src)
1095 {
1096   do
1097     {
1098       size_t head_ofs;
1099       size_t n;
1100
1101       lex_source_expand__ (src);
1102
1103       head_ofs = src->head - src->tail;
1104       n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1105                                     src->allocated - head_ofs,
1106                                     segmenter_get_prompt (&src->segmenter));
1107       if (n == 0)
1108         {
1109           /* End of input.
1110
1111              Ensure that the input always ends in a new-line followed by a null
1112              byte, as required by the segmenter library. */
1113
1114           if (src->head == src->tail
1115               || src->buffer[src->head - src->tail - 1] != '\n')
1116             src->buffer[src->head++ - src->tail] = '\n';
1117
1118           lex_source_expand__ (src);
1119           src->buffer[src->head++ - src->tail] = '\0';
1120
1121           return;
1122         }
1123
1124       src->head += n;
1125     }
1126   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1127                   src->head - src->seg_pos));
1128 }
1129
1130 static struct lex_source *
1131 lex_source__ (const struct lexer *lexer)
1132 {
1133   return (ll_is_empty (&lexer->sources) ? NULL
1134           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1135 }
1136
1137 static struct substring
1138 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1139 {
1140   const struct lex_token *token0 = lex_source_next__ (src, n0);
1141   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1142   size_t start = token0->token_pos;
1143   size_t end = token1->token_pos + token1->token_len;
1144
1145   return ss_buffer (&src->buffer[start - src->tail], end - start);
1146 }
1147
1148 static void
1149 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1150 {
1151   size_t out_maxlen;
1152   size_t out_len;
1153   int mblen;
1154
1155   assert (out_size >= 16);
1156   out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1157   for (out_len = 0; out_len < in.length; out_len += mblen)
1158     {
1159       if (in.string[out_len] == '\n'
1160           || (in.string[out_len] == '\r'
1161               && out_len + 1 < in.length
1162               && in.string[out_len + 1] == '\n'))
1163         break;
1164
1165       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1166                         in.length - out_len);
1167       if (out_len + mblen > out_maxlen)
1168         break;
1169     }
1170
1171   memcpy (out, in.string, out_len);
1172   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1173 }
1174
1175 static void
1176 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1177                          const char *format, va_list args)
1178 {
1179   const struct lex_token *token;
1180   struct string s;
1181   struct msg m;
1182
1183   ds_init_empty (&s);
1184
1185   token = lex_source_next__ (src, n0);
1186   if (token->token.type == T_ENDCMD)
1187     ds_put_cstr (&s, _("Syntax error at end of command"));
1188   else
1189     {
1190       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1191       if (!ss_is_empty (syntax))
1192         {
1193           char syntax_cstr[64];
1194
1195           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1196           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1197         }
1198       else
1199         ds_put_cstr (&s, _("Syntax error"));
1200     }
1201
1202   if (format)
1203     {
1204       ds_put_cstr (&s, ": ");
1205       ds_put_vformat (&s, format, args);
1206     }
1207   ds_put_byte (&s, '.');
1208
1209   m.category = MSG_C_SYNTAX;
1210   m.severity = MSG_S_ERROR;
1211   m.file_name = src->reader->file_name;
1212   m.first_line = lex_source_get_first_line_number (src, n0);
1213   m.last_line = lex_source_get_last_line_number (src, n1);
1214   m.first_column = lex_source_get_first_column (src, n0);
1215   m.last_column = lex_source_get_last_column (src, n1);
1216   m.text = ds_steal_cstr (&s);
1217   msg_emit (&m);
1218 }
1219
1220 static void PRINTF_FORMAT (2, 3)
1221 lex_get_error (struct lex_source *src, const char *format, ...)
1222 {
1223   va_list args;
1224   int n;
1225
1226   va_start (args, format);
1227
1228   n = deque_count (&src->deque) - 1;
1229   lex_source_error_valist (src, n, n, format, args);
1230   lex_source_pop_front (src);
1231
1232   va_end (args);
1233 }
1234
1235 static bool
1236 lex_source_get__ (const struct lex_source *src_)
1237 {
1238   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1239
1240   struct state
1241     {
1242       struct segmenter segmenter;
1243       enum segment_type last_segment;
1244       int newlines;
1245       size_t line_pos;
1246       size_t seg_pos;
1247     };
1248
1249   struct state state, saved;
1250   enum scan_result result;
1251   struct scanner scanner;
1252   struct lex_token *token;
1253   int n_lines;
1254   int i;
1255
1256   if (src->eof)
1257     return false;
1258
1259   state.segmenter = src->segmenter;
1260   state.newlines = 0;
1261   state.seg_pos = src->seg_pos;
1262   state.line_pos = src->line_pos;
1263   saved = state;
1264
1265   token = lex_push_token__ (src);
1266   scanner_init (&scanner, &token->token);
1267   token->line_pos = src->line_pos;
1268   token->token_pos = src->seg_pos;
1269   if (src->reader->line_number > 0)
1270     token->first_line = src->reader->line_number + src->n_newlines;
1271   else
1272     token->first_line = 0;
1273
1274   for (;;)
1275     {
1276       enum segment_type type;
1277       const char *segment;
1278       size_t seg_maxlen;
1279       int seg_len;
1280
1281       segment = &src->buffer[state.seg_pos - src->tail];
1282       seg_maxlen = src->head - state.seg_pos;
1283       seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1284       if (seg_len < 0)
1285         {
1286           lex_source_read__ (src);
1287           continue;
1288         }
1289
1290       state.last_segment = type;
1291       state.seg_pos += seg_len;
1292       if (type == SEG_NEWLINE)
1293         {
1294           state.newlines++;
1295           state.line_pos = state.seg_pos;
1296         }
1297
1298       result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1299                              &token->token);
1300       if (result == SCAN_SAVE)
1301         saved = state;
1302       else if (result == SCAN_BACK)
1303         {
1304           state = saved;
1305           break;
1306         }
1307       else if (result == SCAN_DONE)
1308         break;
1309     }
1310
1311   n_lines = state.newlines;
1312   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1313     {
1314       n_lines++;
1315       src->suppress_next_newline = true;
1316     }
1317   else if (n_lines > 0 && src->suppress_next_newline)
1318     {
1319       n_lines--;
1320       src->suppress_next_newline = false;
1321     }
1322   for (i = 0; i < n_lines; i++)
1323     {
1324       const char *newline;
1325       const char *line;
1326       size_t line_len;
1327       char *syntax;
1328
1329       line = &src->buffer[src->journal_pos - src->tail];
1330       newline = rawmemchr (line, '\n');
1331       line_len = newline - line;
1332       if (line_len > 0 && line[line_len - 1] == '\r')
1333         line_len--;
1334
1335       syntax = malloc (line_len + 2);
1336       memcpy (syntax, line, line_len);
1337       syntax[line_len] = '\n';
1338       syntax[line_len + 1] = '\0';
1339
1340       text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1341
1342       src->journal_pos += newline - line + 1;
1343     }
1344
1345   token->token_len = state.seg_pos - src->seg_pos;
1346
1347   src->segmenter = state.segmenter;
1348   src->seg_pos = state.seg_pos;
1349   src->line_pos = state.line_pos;
1350   src->n_newlines += state.newlines;
1351
1352   switch (token->token.type)
1353     {
1354     default:
1355       break;
1356
1357     case T_STOP:
1358       token->token.type = T_ENDCMD;
1359       src->eof = true;
1360       break;
1361
1362     case SCAN_BAD_HEX_LENGTH:
1363       lex_get_error (src, _("String of hex digits has %d characters, which "
1364                             "is not a multiple of 2"),
1365                      (int) token->token.number);
1366       break;
1367
1368     case SCAN_BAD_HEX_DIGIT:
1369     case SCAN_BAD_UNICODE_DIGIT:
1370       lex_get_error (src, _("`%c' is not a valid hex digit"),
1371                      (int) token->token.number);
1372       break;
1373
1374     case SCAN_BAD_UNICODE_LENGTH:
1375       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1376                             "not in the valid range of 1 to 8 bytes"),
1377                      (int) token->token.number);
1378       break;
1379
1380     case SCAN_BAD_UNICODE_CODE_POINT:
1381       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1382                      (int) token->token.number);
1383       break;
1384
1385     case SCAN_EXPECTED_QUOTE:
1386       lex_get_error (src, _("Unterminated string constant"));
1387       break;
1388
1389     case SCAN_EXPECTED_EXPONENT:
1390       lex_get_error (src, _("Missing exponent following `%s'"),
1391                      token->token.string.string);
1392       break;
1393
1394     case SCAN_UNEXPECTED_DOT:
1395       lex_get_error (src, _("Unexpected `.' in middle of command"));
1396       break;
1397
1398     case SCAN_UNEXPECTED_CHAR:
1399       {
1400         char c_name[16];
1401         lex_get_error (src, _("Bad character %s in input"),
1402                        uc_name (token->token.number, c_name));
1403       }
1404       break;
1405
1406     case SCAN_SKIP:
1407       lex_source_pop_front (src);
1408       break;
1409     }
1410
1411   return true;
1412 }
1413 \f
1414 static void
1415 lex_source_push_endcmd__ (struct lex_source *src)
1416 {
1417   struct lex_token *token = lex_push_token__ (src);
1418   token->token.type = T_ENDCMD;
1419   token->token_pos = 0;
1420   token->token_len = 0;
1421   token->line_pos = 0;
1422   token->first_line = 0;
1423 }
1424
1425 static struct lex_source *
1426 lex_source_create (struct lex_reader *reader)
1427 {
1428   struct lex_source *src;
1429   enum segmenter_mode mode;
1430
1431   src = xzalloc (sizeof *src);
1432   src->reader = reader;
1433
1434   if (reader->syntax == LEX_SYNTAX_AUTO)
1435     mode = SEG_MODE_AUTO;
1436   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1437     mode = SEG_MODE_INTERACTIVE;
1438   else if (reader->syntax == LEX_SYNTAX_BATCH)
1439     mode = SEG_MODE_BATCH;
1440   else
1441     NOT_REACHED ();
1442   segmenter_init (&src->segmenter, mode);
1443
1444   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1445
1446   lex_source_push_endcmd__ (src);
1447
1448   return src;
1449 }
1450
1451 static void
1452 lex_source_destroy (struct lex_source *src)
1453 {
1454   char *file_name = src->reader->file_name;
1455   if (src->reader->class->close != NULL)
1456     src->reader->class->close (src->reader);
1457   free (file_name);
1458   free (src->buffer);
1459   while (!deque_is_empty (&src->deque))
1460     lex_source_pop__ (src);
1461   free (src->tokens);
1462   ll_remove (&src->ll);
1463   free (src);
1464 }
1465 \f
1466 struct lex_file_reader
1467   {
1468     struct lex_reader reader;
1469     struct u8_istream *istream;
1470     char *file_name;
1471   };
1472
1473 static struct lex_reader_class lex_file_reader_class;
1474
1475 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1476    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1477    ENCODING, which should take one of the forms accepted by
1478    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1479    mode of the new reader, respectively.
1480
1481    Returns a null pointer if FILE_NAME cannot be opened. */
1482 struct lex_reader *
1483 lex_reader_for_file (const char *file_name, const char *encoding,
1484                      enum lex_syntax_mode syntax,
1485                      enum lex_error_mode error)
1486 {
1487   struct lex_file_reader *r;
1488   struct u8_istream *istream;
1489
1490   istream = (!strcmp(file_name, "-")
1491              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1492              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1493   if (istream == NULL)
1494     {
1495       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1496       return NULL;
1497     }
1498
1499   r = xmalloc (sizeof *r);
1500   lex_reader_init (&r->reader, &lex_file_reader_class);
1501   r->reader.syntax = syntax;
1502   r->reader.error = error;
1503   r->reader.file_name = xstrdup (file_name);
1504   r->reader.line_number = 1;
1505   r->istream = istream;
1506   r->file_name = xstrdup (file_name);
1507
1508   return &r->reader;
1509 }
1510
1511 static struct lex_file_reader *
1512 lex_file_reader_cast (struct lex_reader *r)
1513 {
1514   return UP_CAST (r, struct lex_file_reader, reader);
1515 }
1516
1517 static size_t
1518 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1519                enum prompt_style prompt_style UNUSED)
1520 {
1521   struct lex_file_reader *r = lex_file_reader_cast (r_);
1522   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1523   if (n_read < 0)
1524     {
1525       msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
1526       return 0;
1527     }
1528   return n_read;
1529 }
1530
1531 static void
1532 lex_file_close (struct lex_reader *r_)
1533 {
1534   struct lex_file_reader *r = lex_file_reader_cast (r_);
1535
1536   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1537     {
1538       if (u8_istream_close (r->istream) != 0)
1539         msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
1540     }
1541   else
1542     u8_istream_free (r->istream);
1543
1544   free (r->file_name);
1545   free (r);
1546 }
1547
1548 static struct lex_reader_class lex_file_reader_class =
1549   {
1550     lex_file_read,
1551     lex_file_close
1552   };
1553 \f
1554 struct lex_string_reader
1555   {
1556     struct lex_reader reader;
1557     struct substring s;
1558     size_t offset;
1559   };
1560
1561 static struct lex_reader_class lex_string_reader_class;
1562
1563 /* Creates and returns a new lex_reader for the contents of S, which must be
1564    encoded in UTF-8.  The new reader takes ownership of S and will free it
1565    with ss_dealloc() when it is closed. */
1566 struct lex_reader *
1567 lex_reader_for_substring_nocopy (struct substring s)
1568 {
1569   struct lex_string_reader *r;
1570
1571   r = xmalloc (sizeof *r);
1572   lex_reader_init (&r->reader, &lex_string_reader_class);
1573   r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1574   r->s = s;
1575   r->offset = 0;
1576
1577   return &r->reader;
1578 }
1579
1580 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1581    which must be encoded in UTF-8.  The caller retains ownership of S. */
1582 struct lex_reader *
1583 lex_reader_for_string (const char *s)
1584 {
1585   struct substring ss;
1586   ss_alloc_substring (&ss, ss_cstr (s));
1587   return lex_reader_for_substring_nocopy (ss);
1588 }
1589
1590 /* Formats FORMAT as a printf()-like format string and creates and returns a
1591    new lex_reader for the formatted result.  */
1592 struct lex_reader *
1593 lex_reader_for_format (const char *format, ...)
1594 {
1595   struct lex_reader *r;
1596   va_list args;
1597
1598   va_start (args, format);
1599   r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)));
1600   va_end (args);
1601
1602   return r;
1603 }
1604
1605 static struct lex_string_reader *
1606 lex_string_reader_cast (struct lex_reader *r)
1607 {
1608   return UP_CAST (r, struct lex_string_reader, reader);
1609 }
1610
1611 static size_t
1612 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1613                  enum prompt_style prompt_style UNUSED)
1614 {
1615   struct lex_string_reader *r = lex_string_reader_cast (r_);
1616   size_t chunk;
1617
1618   chunk = MIN (n, r->s.length - r->offset);
1619   memcpy (buf, r->s.string + r->offset, chunk);
1620   r->offset += chunk;
1621
1622   return chunk;
1623 }
1624
1625 static void
1626 lex_string_close (struct lex_reader *r_)
1627 {
1628   struct lex_string_reader *r = lex_string_reader_cast (r_);
1629
1630   ss_dealloc (&r->s);
1631   free (r);
1632 }
1633
1634 static struct lex_reader_class lex_string_reader_class =
1635   {
1636     lex_string_read,
1637     lex_string_close
1638   };