pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source.

   Besides the parsed token itself, this records where in the source text the
   token came from.  The line and column helpers in this file use these
   members to compute the token's position. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Location of token in terms of the lex_source's buffer.
       src->tail <= line_pos <= token_pos <= src->head.

       These are offsets into the whole UTF-8 source, not into the buffer
       array: users subtract src->tail before indexing src->buffer. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */
  };
  71
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       New tokens are pushed at the front of 'deque' (lex_push_token__) and
       the parser's current token is the one at the back
       (lex_source_next__, lex_source_pop__). */
    struct deque deque;         /* Indexes into 'tokens'. */
    struct lex_token *tokens;   /* Lookahead tokens for parser. */
  };
 101
/* Forward declarations.  lex_include(), lex_append(), lex_destroy() and
   lex_get() below call these before any definition of them appears. */
static struct lex_source *lex_source_create (struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 104
/* Lexer.

   A lexer is a list of sources.  lex_include() pushes a new source at the
   head of the list and lex_append() pushes one at the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
  };
 110
/* Forward declarations of static helpers that are used in this file ahead of
   (or without a visible) definition. */
static struct lex_source *lex_source__ (const struct lexer *);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static void lex_source_pop__ (struct lex_source *);
static bool lex_source_get__ (const struct lex_source *);
/* The format string is argument 4.  NOTE(review): the 0 presumably means
   there are no variadic arguments to check because a va_list is passed
   instead -- confirm against the PRINTF_FORMAT definition. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 122 \f
 123 /* Initializes READER with the specified CLASS and otherwise some reasonable
 124    defaults.  The caller should fill in the others members as desired. */
 125 void
 126 lex_reader_init (struct lex_reader *reader,
 127                  const struct lex_reader_class *class)
 128 {
 129   reader->class = class;
 130   reader->syntax = LEX_SYNTAX_AUTO;
 131   reader->error = LEX_ERROR_CONTINUE;
 132   reader->file_name = NULL;
 133   reader->encoding = NULL;
 134   reader->line_number = 0;
 135   reader->eof = false;
 136 }
 137
 138 /* Frees any file name already in READER and replaces it by a copy of
 139    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 140 void
 141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 142 {
 143   free (reader->file_name);
 144   reader->file_name = xstrdup_if_nonnull (file_name);
 145 }
 146 \f
 147 /* Creates and returns a new lexer. */
 148 struct lexer *
 149 lex_create (void)
 150 {
 151   struct lexer *lexer = xzalloc (sizeof *lexer);
 152   ll_init (&lexer->sources);
 153   return lexer;
 154 }
 155
 156 /* Destroys LEXER. */
 157 void
 158 lex_destroy (struct lexer *lexer)
 159 {
 160   if (lexer != NULL)
 161     {
 162       struct lex_source *source, *next;
 163
 164       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 165         lex_source_destroy (source);
 166       free (lexer);
 167     }
 168 }
 169
 170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 171    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 172    token. */
 173 void
 174 lex_include (struct lexer *lexer, struct lex_reader *reader)
 175 {
 176   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 177   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 178 }
 179
 180 /* Appends READER to LEXER, so that it will be read after all other current
 181    readers have already been read. */
 182 void
 183 lex_append (struct lexer *lexer, struct lex_reader *reader)
 184 {
 185   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 186 }
 187 \f
 188 /* Advancing. */
 189
 190 static struct lex_token *
 191 lex_push_token__ (struct lex_source *src)
 192 {
 193   struct lex_token *token;
 194
 195   if (deque_is_full (&src->deque))
 196     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 197
 198   token = &src->tokens[deque_push_front (&src->deque)];
 199   token_init (&token->token);
 200   return token;
 201 }
 202
 203 static void
 204 lex_source_pop__ (struct lex_source *src)
 205 {
 206   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 207 }
 208
 209 static void
 210 lex_source_pop_front (struct lex_source *src)
 211 {
 212   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 213 }
 214
 215 /* Advances LEXER to the next token, consuming the current token. */
 216 void
 217 lex_get (struct lexer *lexer)
 218 {
 219   struct lex_source *src;
 220
 221   src = lex_source__ (lexer);
 222   if (src == NULL)
 223     return;
 224
 225   if (!deque_is_empty (&src->deque))
 226     lex_source_pop__ (src);
 227
 228   while (deque_is_empty (&src->deque))
 229     if (!lex_source_get__ (src))
 230       {
 231         lex_source_destroy (src);
 232         src = lex_source__ (lexer);
 233         if (src == NULL)
 234           return;
 235       }
 236 }
 237 \f
 238 /* Issuing errors. */
 239
 240 /* Prints a syntax error message containing the current token and
 241    given message MESSAGE (if non-null). */
 242 void
 243 lex_error (struct lexer *lexer, const char *format, ...)
 244 {
 245   va_list args;
 246
 247   va_start (args, format);
 248   lex_next_error_valist (lexer, 0, 0, format, args);
 249   va_end (args);
 250 }
 251
 252 /* Prints a syntax error message containing the current token and
 253    given message MESSAGE (if non-null). */
 254 void
 255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 256 {
 257   lex_next_error_valist (lexer, 0, 0, format, args);
 258 }
 259
 260 /* Prints a syntax error message containing the current token and
 261    given message MESSAGE (if non-null). */
 262 void
 263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 264 {
 265   va_list args;
 266
 267   va_start (args, format);
 268   lex_next_error_valist (lexer, n0, n1, format, args);
 269   va_end (args);
 270 }
 271
 272 /* Prints a syntax error message saying that one of the strings provided as
 273    varargs, up to the first NULL, is expected. */
 274 void
 275 (lex_error_expecting) (struct lexer *lexer, ...)
 276 {
 277   va_list args;
 278
 279   va_start (args, lexer);
 280   lex_error_expecting_valist (lexer, args);
 281   va_end (args);
 282 }
 283
 284 /* Prints a syntax error message saying that one of the options provided in
 285    ARGS, up to the first NULL, is expected. */
 286 void
 287 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 288 {
 289   enum { MAX_OPTIONS = 9 };
 290   const char *options[MAX_OPTIONS];
 291   int n = 0;
 292   while (n < MAX_OPTIONS)
 293     {
 294       const char *option = va_arg (args, const char *);
 295       if (!option)
 296         break;
 297
 298       options[n++] = option;
 299     }
 300   lex_error_expecting_array (lexer, options, n);
 301 }
 302
 303 void
 304 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 305 {
 306   switch (n)
 307     {
 308     case 0:
 309       lex_error (lexer, NULL);
 310       break;
 311
 312     case 1:
 313       lex_error (lexer, _("expecting %s"), options[0]);
 314       break;
 315
 316     case 2:
 317       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 318       break;
 319
 320     case 3:
 321       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 322                  options[2]);
 323       break;
 324
 325     case 4:
 326       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 327                  options[0], options[1], options[2], options[3]);
 328       break;
 329
 330     case 5:
 331       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 332                  options[0], options[1], options[2], options[3], options[4]);
 333       break;
 334
 335     case 6:
 336       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 337                  options[0], options[1], options[2], options[3], options[4],
 338                  options[5]);
 339       break;
 340
 341     case 7:
 342       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 343                  options[0], options[1], options[2], options[3], options[4],
 344                  options[5], options[6]);
 345       break;
 346
 347     case 8:
 348       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 349                  options[0], options[1], options[2], options[3], options[4],
 350                  options[5], options[6], options[7]);
 351       break;
 352
 353     default:
 354       lex_error (lexer, NULL);
 355     }
 356 }
 357
 358 /* Reports an error to the effect that subcommand SBC may only be specified
 359    once.
 360
 361    This function does not take a lexer as an argument or use lex_error(),
 362    because the result would ordinarily just be redundant: "Syntax error at
 363    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 364    not help the user find the error. */
 365 void
 366 lex_sbc_only_once (const char *sbc)
 367 {
 368   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 369 }
 370
 371 /* Reports an error to the effect that subcommand SBC is missing.
 372
 373    This function does not take a lexer as an argument or use lex_error(),
 374    because a missing subcommand can normally be detected only after the whole
 375    command has been parsed, and so lex_error() would always report "Syntax
 376    error at end of command", which does not help the user find the error. */
 377 void
 378 lex_sbc_missing (const char *sbc)
 379 {
 380   msg (SE, _("Required subcommand %s was not specified."), sbc);
 381 }
 382
 383 /* Reports an error to the effect that specification SPEC may only be specified
 384    once within subcommand SBC. */
 385 void
 386 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 387 {
 388   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 389              spec, sbc);
 390 }
 391
 392 /* Reports an error to the effect that specification SPEC is missing within
 393    subcommand SBC. */
 394 void
 395 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 396 {
 397   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 398              sbc, spec);
 399 }
 400
 401 /* Prints a syntax error message containing the current token and
 402    given message MESSAGE (if non-null). */
 403 void
 404 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 405                        const char *format, va_list args)
 406 {
 407   struct lex_source *src = lex_source__ (lexer);
 408
 409   if (src != NULL)
 410     lex_source_error_valist (src, n0, n1, format, args);
 411   else
 412     {
 413       struct string s;
 414
 415       ds_init_empty (&s);
 416       ds_put_format (&s, _("Syntax error at end of input"));
 417       if (format != NULL)
 418         {
 419           ds_put_cstr (&s, ": ");
 420           ds_put_vformat (&s, format, args);
 421         }
 422       ds_put_byte (&s, '.');
 423       msg (SE, "%s", ds_cstr (&s));
 424       ds_destroy (&s);
 425     }
 426 }
 427
 428 /* Checks that we're at end of command.
 429    If so, returns a successful command completion code.
 430    If not, flags a syntax error and returns an error command
 431    completion code. */
 432 int
 433 lex_end_of_command (struct lexer *lexer)
 434 {
 435   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 436     {
 437       lex_error (lexer, _("expecting end of command"));
 438       return CMD_FAILURE;
 439     }
 440   else
 441     return CMD_SUCCESS;
 442 }
 443 \f
 444 /* Token testing functions. */
 445
 446 /* Returns true if the current token is a number. */
 447 bool
 448 lex_is_number (const struct lexer *lexer)
 449 {
 450   return lex_next_is_number (lexer, 0);
 451 }
 452
 453 /* Returns true if the current token is a string. */
 454 bool
 455 lex_is_string (const struct lexer *lexer)
 456 {
 457   return lex_next_is_string (lexer, 0);
 458 }
 459
 460 /* Returns the value of the current token, which must be a
 461    floating point number. */
 462 double
 463 lex_number (const struct lexer *lexer)
 464 {
 465   return lex_next_number (lexer, 0);
 466 }
 467
 468 /* Returns true iff the current token is an integer. */
 469 bool
 470 lex_is_integer (const struct lexer *lexer)
 471 {
 472   return lex_next_is_integer (lexer, 0);
 473 }
 474
 475 /* Returns the value of the current token, which must be an
 476    integer. */
 477 long
 478 lex_integer (const struct lexer *lexer)
 479 {
 480   return lex_next_integer (lexer, 0);
 481 }
 482 \f
 483 /* Token testing functions with lookahead.
 484
 485    A value of 0 for N as an argument to any of these functions refers to the
 486    current token.  Lookahead is limited to the current command.  Any N greater
 487    than the number of tokens remaining in the current command will be treated
 488    as referring to a T_ENDCMD token. */
 489
 490 /* Returns true if the token N ahead of the current token is a number. */
 491 bool
 492 lex_next_is_number (const struct lexer *lexer, int n)
 493 {
 494   enum token_type next_token = lex_next_token (lexer, n);
 495   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 496 }
 497
 498 /* Returns true if the token N ahead of the current token is a string. */
 499 bool
 500 lex_next_is_string (const struct lexer *lexer, int n)
 501 {
 502   return lex_next_token (lexer, n) == T_STRING;
 503 }
 504
 505 /* Returns the value of the token N ahead of the current token, which must be a
 506    floating point number. */
 507 double
 508 lex_next_number (const struct lexer *lexer, int n)
 509 {
 510   assert (lex_next_is_number (lexer, n));
 511   return lex_next_tokval (lexer, n);
 512 }
 513
 514 /* Returns true if the token N ahead of the current token is an integer. */
 515 bool
 516 lex_next_is_integer (const struct lexer *lexer, int n)
 517 {
 518   double value;
 519
 520   if (!lex_next_is_number (lexer, n))
 521     return false;
 522
 523   value = lex_next_tokval (lexer, n);
 524   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 525 }
 526
 527 /* Returns the value of the token N ahead of the current token, which must be
 528    an integer. */
 529 long
 530 lex_next_integer (const struct lexer *lexer, int n)
 531 {
 532   assert (lex_next_is_integer (lexer, n));
 533   return lex_next_tokval (lexer, n);
 534 }
 535 \f
 536 /* Token matching functions. */
 537
 538 /* If the current token has the specified TYPE, skips it and returns true.
 539    Otherwise, returns false. */
 540 bool
 541 lex_match (struct lexer *lexer, enum token_type type)
 542 {
 543   if (lex_token (lexer) == type)
 544     {
 545       lex_get (lexer);
 546       return true;
 547     }
 548   else
 549     return false;
 550 }
 551
 552 /* If the current token matches IDENTIFIER, skips it and returns true.
 553    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 554    returns false.
 555
 556    IDENTIFIER must be an ASCII string. */
 557 bool
 558 lex_match_id (struct lexer *lexer, const char *identifier)
 559 {
 560   return lex_match_id_n (lexer, identifier, 3);
 561 }
 562
 563 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 564    may be abbreviated to its first N letters.  Otherwise, returns false.
 565
 566    IDENTIFIER must be an ASCII string. */
 567 bool
 568 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 569 {
 570   if (lex_token (lexer) == T_ID
 571       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 572     {
 573       lex_get (lexer);
 574       return true;
 575     }
 576   else
 577     return false;
 578 }
 579
 580 /* If the current token is integer X, skips it and returns true.  Otherwise,
 581    returns false. */
 582 bool
 583 lex_match_int (struct lexer *lexer, int x)
 584 {
 585   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 586     {
 587       lex_get (lexer);
 588       return true;
 589     }
 590   else
 591     return false;
 592 }
 593 \f
 594 /* Forced matches. */
 595
 596 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 597    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 598    false.
 599
 600    IDENTIFIER must be an ASCII string. */
 601 bool
 602 lex_force_match_id (struct lexer *lexer, const char *identifier)
 603 {
 604   if (lex_match_id (lexer, identifier))
 605     return true;
 606   else
 607     {
 608       lex_error_expecting (lexer, identifier);
 609       return false;
 610     }
 611 }
 612
 613 /* If the current token has the specified TYPE, skips it and returns true.
 614    Otherwise, reports an error and returns false. */
 615 bool
 616 lex_force_match (struct lexer *lexer, enum token_type type)
 617 {
 618   if (lex_token (lexer) == type)
 619     {
 620       lex_get (lexer);
 621       return true;
 622     }
 623   else
 624     {
 625       const char *type_string = token_type_to_string (type);
 626       if (type_string)
 627         {
 628           char *s = xasprintf ("`%s'", type_string);
 629           lex_error_expecting (lexer, s);
 630           free (s);
 631         }
 632       else
 633         lex_error_expecting (lexer, token_type_to_name (type));
 634
 635       return false;
 636     }
 637 }
 638
 639 /* If the current token is a string, does nothing and returns true.
 640    Otherwise, reports an error and returns false. */
 641 bool
 642 lex_force_string (struct lexer *lexer)
 643 {
 644   if (lex_is_string (lexer))
 645     return true;
 646   else
 647     {
 648       lex_error (lexer, _("expecting string"));
 649       return false;
 650     }
 651 }
 652
 653 /* If the current token is a string or an identifier, does nothing and returns
 654    true.  Otherwise, reports an error and returns false.
 655
 656    This is meant for use in syntactic situations where we want to encourage the
 657    user to supply a quoted string, but for compatibility we also accept
 658    identifiers.  (One example of such a situation is file names.)  Therefore,
 659    the error message issued when the current token is wrong only says that a
 660    string is expected and doesn't mention that an identifier would also be
 661    accepted. */
 662 bool
 663 lex_force_string_or_id (struct lexer *lexer)
 664 {
 665   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 666 }
 667
 668 /* If the current token is an integer, does nothing and returns true.
 669    Otherwise, reports an error and returns false. */
 670 bool
 671 lex_force_int (struct lexer *lexer)
 672 {
 673   if (lex_is_integer (lexer))
 674     return true;
 675   else
 676     {
 677       lex_error (lexer, _("expecting integer"));
 678       return false;
 679     }
 680 }
 681
 682 /* If the current token is a number, does nothing and returns true.
 683    Otherwise, reports an error and returns false. */
 684 bool
 685 lex_force_num (struct lexer *lexer)
 686 {
 687   if (lex_is_number (lexer))
 688     return true;
 689
 690   lex_error (lexer, _("expecting number"));
 691   return false;
 692 }
 693
 694 /* If the current token is an identifier, does nothing and returns true.
 695    Otherwise, reports an error and returns false. */
 696 bool
 697 lex_force_id (struct lexer *lexer)
 698 {
 699   if (lex_token (lexer) == T_ID)
 700     return true;
 701
 702   lex_error (lexer, _("expecting identifier"));
 703   return false;
 704 }
 705 \f
 706 /* Token accessors. */
 707
 708 /* Returns the type of LEXER's current token. */
 709 enum token_type
 710 lex_token (const struct lexer *lexer)
 711 {
 712   return lex_next_token (lexer, 0);
 713 }
 714
 715 /* Returns the number in LEXER's current token.
 716
 717    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 718    tokens this function will always return zero. */
 719 double
 720 lex_tokval (const struct lexer *lexer)
 721 {
 722   return lex_next_tokval (lexer, 0);
 723 }
 724
 725 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 726
 727    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 728    this functions this function will always return NULL.
 729
 730    The UTF-8 encoding of the returned string is correct for variable names and
 731    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 732    data_in() to use it in a "union value".  */
 733 const char *
 734 lex_tokcstr (const struct lexer *lexer)
 735 {
 736   return lex_next_tokcstr (lexer, 0);
 737 }
 738
 739 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 740    null-terminated (but the null terminator is not included in the returned
 741    substring's 'length').
 742
 743    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 744    this functions this function will always return NULL.
 745
 746    The UTF-8 encoding of the returned string is correct for variable names and
 747    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 748    data_in() to use it in a "union value".  */
 749 struct substring
 750 lex_tokss (const struct lexer *lexer)
 751 {
 752   return lex_next_tokss (lexer, 0);
 753 }
 754 \f
 755 /* Looking ahead.
 756
 757    A value of 0 for N as an argument to any of these functions refers to the
 758    current token.  Lookahead is limited to the current command.  Any N greater
 759    than the number of tokens remaining in the current command will be treated
 760    as referring to a T_ENDCMD token. */
 761
 762 static const struct lex_token *
 763 lex_next__ (const struct lexer *lexer_, int n)
 764 {
 765   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 766   struct lex_source *src = lex_source__ (lexer);
 767
 768   if (src != NULL)
 769     return lex_source_next__ (src, n);
 770   else
 771     {
 772       static const struct lex_token stop_token =
 773         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 774
 775       return &stop_token;
 776     }
 777 }
 778
 779 static const struct lex_token *
 780 lex_source_next__ (const struct lex_source *src, int n)
 781 {
 782   while (deque_count (&src->deque) <= n)
 783     {
 784       if (!deque_is_empty (&src->deque))
 785         {
 786           struct lex_token *front;
 787
 788           front = &src->tokens[deque_front (&src->deque, 0)];
 789           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 790             return front;
 791         }
 792
 793       lex_source_get__ (src);
 794     }
 795
 796   return &src->tokens[deque_back (&src->deque, n)];
 797 }
 798
 799 /* Returns the "struct token" of the token N after the current one in LEXER.
 800    The returned pointer can be invalidated by pretty much any succeeding call
 801    into the lexer, although the string pointer within the returned token is
 802    only invalidated by consuming the token (e.g. with lex_get()). */
 803 const struct token *
 804 lex_next (const struct lexer *lexer, int n)
 805 {
 806   return &lex_next__ (lexer, n)->token;
 807 }
 808
 809 /* Returns the type of the token N after the current one in LEXER. */
 810 enum token_type
 811 lex_next_token (const struct lexer *lexer, int n)
 812 {
 813   return lex_next (lexer, n)->type;
 814 }
 815
 816 /* Returns the number in the tokn N after the current one in LEXER.
 817
 818    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 819    tokens this function will always return zero. */
 820 double
 821 lex_next_tokval (const struct lexer *lexer, int n)
 822 {
 823   const struct token *token = lex_next (lexer, n);
 824   return token->number;
 825 }
 826
 827 /* Returns the null-terminated string in the token N after the current one, in
 828    UTF-8 encoding.
 829
 830    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 831    this functions this function will always return NULL.
 832
 833    The UTF-8 encoding of the returned string is correct for variable names and
 834    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 835    data_in() to use it in a "union value".  */
 836 const char *
 837 lex_next_tokcstr (const struct lexer *lexer, int n)
 838 {
 839   return lex_next_tokss (lexer, n).string;
 840 }
 841
 842 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 843    The string is null-terminated (but the null terminator is not included in
 844    the returned substring's 'length').
 845
 846    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 847    this functions this function will always return NULL.
 848
 849    The UTF-8 encoding of the returned string is correct for variable names and
 850    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 851    data_in() to use it in a "union value".  */
 852 struct substring
 853 lex_next_tokss (const struct lexer *lexer, int n)
 854 {
 855   return lex_next (lexer, n)->string;
 856 }
 857
 858 static bool
 859 lex_tokens_match (const struct token *actual, const struct token *expected)
 860 {
 861   if (actual->type != expected->type)
 862     return false;
 863
 864   switch (actual->type)
 865     {
 866     case T_POS_NUM:
 867     case T_NEG_NUM:
 868       return actual->number == expected->number;
 869
 870     case T_ID:
 871       return lex_id_match (expected->string, actual->string);
 872
 873     case T_STRING:
 874       return (actual->string.length == expected->string.length
 875               && !memcmp (actual->string.string, expected->string.string,
 876                           actual->string.length));
 877
 878     default:
 879       return true;
 880     }
 881 }
 882
 883 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
 884    skips it and returns true.  Otherwise, returns false.
 885
 886    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
 887    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
 888    first three letters. */
 889 bool
 890 lex_match_phrase (struct lexer *lexer, const char *s)
 891 {
 892   struct string_lexer slex;
 893   struct token token;
 894   int i;
 895
 896   i = 0;
 897   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
 898   while (string_lexer_next (&slex, &token))
 899     if (token.type != SCAN_SKIP)
 900       {
 901         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
 902         token_destroy (&token);
 903         if (!match)
 904           return false;
 905       }
 906
 907   while (i-- > 0)
 908     lex_get (lexer);
 909   return true;
 910 }
 911
 912 static int
 913 lex_source_get_first_line_number (const struct lex_source *src, int n)
 914 {
 915   return lex_source_next__ (src, n)->first_line;
 916 }
 917
 918 static int
 919 count_newlines (char *s, size_t length)
 920 {
 921   int n_newlines = 0;
 922   char *newline;
 923
 924   while ((newline = memchr (s, '\n', length)) != NULL)
 925     {
 926       n_newlines++;
 927       length -= (newline + 1) - s;
 928       s = newline + 1;
 929     }
 930
 931   return n_newlines;
 932 }
 933
 934 static int
 935 lex_source_get_last_line_number (const struct lex_source *src, int n)
 936 {
 937   const struct lex_token *token = lex_source_next__ (src, n);
 938
 939   if (token->first_line == 0)
 940     return 0;
 941   else
 942     {
 943       char *token_str = &src->buffer[token->token_pos - src->tail];
 944       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 945     }
 946 }
 947
 948 static int
 949 count_columns (const char *s_, size_t length)
 950 {
 951   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 952   int columns;
 953   size_t ofs;
 954   int mblen;
 955
 956   columns = 0;
 957   for (ofs = 0; ofs < length; ofs += mblen)
 958     {
 959       ucs4_t uc;
 960
 961       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 962       if (uc != '\t')
 963         {
 964           int width = uc_width (uc, "UTF-8");
 965           if (width > 0)
 966             columns += width;
 967         }
 968       else
 969         columns = ROUND_UP (columns + 1, 8);
 970     }
 971
 972   return columns + 1;
 973 }
 974
 975 static int
 976 lex_source_get_first_column (const struct lex_source *src, int n)
 977 {
 978   const struct lex_token *token = lex_source_next__ (src, n);
 979   return count_columns (&src->buffer[token->line_pos - src->tail],
 980                         token->token_pos - token->line_pos);
 981 }
 982
 983 static int
 984 lex_source_get_last_column (const struct lex_source *src, int n)
 985 {
 986   const struct lex_token *token = lex_source_next__ (src, n);
 987   char *start, *end, *newline;
 988
 989   start = &src->buffer[token->line_pos - src->tail];
 990   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 991   newline = memrchr (start, '\n', end - start);
 992   if (newline != NULL)
 993     start = newline + 1;
 994   return count_columns (start, end - start);
 995 }
 996
 /* Looks N tokens past the current one in LEXER and returns the 1-based number
    of the line on which that token's syntax starts.  The result is 0 for a
    T_STOP token or if the token is drawn from a source that does not have
    line numbers. */
 int
 lex_get_first_line_number (const struct lexer *lexer, int n)
 {
   const struct lex_source *src = lex_source__ (lexer);
   if (src == NULL)
     return 0;
   return lex_source_get_first_line_number (src, n);
 }
1006
/* Looks N tokens past the current one in LEXER and returns the 1-based number
   of the line on which that token's syntax ends, plus 1.  The result is 0 for
   a T_STOP token or if the token is drawn from a source that does not have
   line numbers.

   A token usually lies wholly within one line of syntax.  The two exceptions
   are a T_STRING token, which can be made up of multiple segments on adjacent
   lines connected with "+" punctuators, and a T_NEG_NUM token, which can
   consist of a "-" on one line followed by the number on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_last_line_number (src, n);
}
1023
/* Looks N tokens past the current one in LEXER and returns the 1-based column
   number at which that token's syntax starts.  The result is 0 for a T_STOP
   token.

   Columns are counted by the width of characters as shown in a typical
   fixed-width font, in which CJK characters have width 2 and combining
   characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
1037
/* Looks N tokens past the current one in LEXER and returns the 1-based column
   number at which that token's syntax ends, plus 1.  The result is 0 for a
   T_STOP token.

   Columns are counted by the width of characters as shown in a typical
   fixed-width font, in which CJK characters have width 2 and combining
   characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1051
1052 /* Returns the name of the syntax file from which the current command is drawn.
1053    Returns NULL for a T_STOP token or if the command's source does not have
1054    line numbers.
1055
1056    There is no version of this function that takes an N argument because
1057    lookahead only works to the end of a command and any given command is always
1058    within a single syntax file. */
1059 const char *
1060 lex_get_file_name (const struct lexer *lexer)
1061 {
1062   struct lex_source *src = lex_source__ (lexer);
1063   return src == NULL ? NULL : src->reader->file_name;
1064 }
1065
1066 const char *
1067 lex_get_encoding (const struct lexer *lexer)
1068 {
1069   struct lex_source *src = lex_source__ (lexer);
1070   return src == NULL ? NULL : src->reader->encoding;
1071 }
1072
1073
1074 /* Returns the syntax mode for the syntax file from which the current drawn is
1075    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1076    source does not have line numbers.
1077
1078    There is no version of this function that takes an N argument because
1079    lookahead only works to the end of a command and any given command is always
1080    within a single syntax file. */
1081 enum lex_syntax_mode
1082 lex_get_syntax_mode (const struct lexer *lexer)
1083 {
1084   struct lex_source *src = lex_source__ (lexer);
1085   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1086 }
1087
1088 /* Returns the error mode for the syntax file from which the current drawn is
1089    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1090    source does not have line numbers.
1091
1092    There is no version of this function that takes an N argument because
1093    lookahead only works to the end of a command and any given command is always
1094    within a single syntax file. */
1095 enum lex_error_mode
1096 lex_get_error_mode (const struct lexer *lexer)
1097 {
1098   struct lex_source *src = lex_source__ (lexer);
1099   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1100 }
1101
1102 /* If the source that LEXER is currently reading has error mode
1103    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1104    token to be read comes directly from whatever is next read from the stream.
1105
1106    It makes sense to call this function after encountering an error in a
1107    command entered on the console, because usually the user would prefer not to
1108    have cascading errors. */
1109 void
1110 lex_interactive_reset (struct lexer *lexer)
1111 {
1112   struct lex_source *src = lex_source__ (lexer);
1113   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1114     {
1115       src->head = src->tail = 0;
1116       src->journal_pos = src->seg_pos = src->line_pos = 0;
1117       src->n_newlines = 0;
1118       src->suppress_next_newline = false;
1119       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1120       while (!deque_is_empty (&src->deque))
1121         lex_source_pop__ (src);
1122       lex_source_push_endcmd__ (src);
1123     }
1124 }
1125
1126 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1127 void
1128 lex_discard_rest_of_command (struct lexer *lexer)
1129 {
1130   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1131     lex_get (lexer);
1132 }
1133
1134 /* Discards all lookahead tokens in LEXER, then discards all input sources
1135    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1136    runs out of input sources. */
1137 void
1138 lex_discard_noninteractive (struct lexer *lexer)
1139 {
1140   struct lex_source *src = lex_source__ (lexer);
1141
1142   if (src != NULL)
1143     {
1144       while (!deque_is_empty (&src->deque))
1145         lex_source_pop__ (src);
1146
1147       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1148            src = lex_source__ (lexer))
1149         lex_source_destroy (src);
1150     }
1151 }
1152 \f
1153 static size_t
1154 lex_source_max_tail__ (const struct lex_source *src)
1155 {
1156   const struct lex_token *token;
1157   size_t max_tail;
1158
1159   assert (src->seg_pos >= src->line_pos);
1160   max_tail = MIN (src->journal_pos, src->line_pos);
1161
1162   /* Use the oldest token also.  (We know that src->deque cannot be empty
1163      because we are in the process of adding a new token, which is already
1164      initialized enough to use here.) */
1165   token = &src->tokens[deque_back (&src->deque, 0)];
1166   assert (token->token_pos >= token->line_pos);
1167   max_tail = MIN (max_tail, token->line_pos);
1168
1169   return max_tail;
1170 }
1171
1172 static void
1173 lex_source_expand__ (struct lex_source *src)
1174 {
1175   if (src->head - src->tail >= src->allocated)
1176     {
1177       size_t max_tail = lex_source_max_tail__ (src);
1178       if (max_tail > src->tail)
1179         {
1180           /* Advance the tail, freeing up room at the head. */
1181           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1182                    src->head - max_tail);
1183           src->tail = max_tail;
1184         }
1185       else
1186         {
1187           /* Buffer is completely full.  Expand it. */
1188           src->buffer = x2realloc (src->buffer, &src->allocated);
1189         }
1190     }
1191   else
1192     {
1193       /* There's space available at the head of the buffer.  Nothing to do. */
1194     }
1195 }
1196
1197 static void
1198 lex_source_read__ (struct lex_source *src)
1199 {
1200   do
1201     {
1202       lex_source_expand__ (src);
1203
1204       size_t head_ofs = src->head - src->tail;
1205       size_t space = src->allocated - head_ofs;
1206       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1207       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1208                                            space, prompt);
1209       assert (n <= space);
1210
1211       if (n == 0)
1212         {
1213           /* End of input. */
1214           src->reader->eof = true;
1215           lex_source_expand__ (src);
1216           return;
1217         }
1218
1219       src->head += n;
1220     }
1221   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1222                   src->head - src->seg_pos));
1223 }
1224
1225 static struct lex_source *
1226 lex_source__ (const struct lexer *lexer)
1227 {
1228   return (ll_is_empty (&lexer->sources) ? NULL
1229           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1230 }
1231
1232 static struct substring
1233 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1234 {
1235   const struct lex_token *token0 = lex_source_next__ (src, n0);
1236   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1237   size_t start = token0->token_pos;
1238   size_t end = token1->token_pos + token1->token_len;
1239
1240   return ss_buffer (&src->buffer[start - src->tail], end - start);
1241 }
1242
1243 static void
1244 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1245 {
1246   size_t out_maxlen;
1247   size_t out_len;
1248   int mblen;
1249
1250   assert (out_size >= 16);
1251   out_maxlen = out_size - 1;
1252   if (in.length > out_maxlen - 3)
1253     out_maxlen -= 3;
1254
1255   for (out_len = 0; out_len < in.length; out_len += mblen)
1256     {
1257       if (in.string[out_len] == '\n'
1258           || in.string[out_len] == '\0'
1259           || (in.string[out_len] == '\r'
1260               && out_len + 1 < in.length
1261               && in.string[out_len + 1] == '\n'))
1262         break;
1263
1264       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1265                         in.length - out_len);
1266
1267       if (mblen < 0)
1268         break;
1269
1270       if (out_len + mblen > out_maxlen)
1271         break;
1272     }
1273
1274   memcpy (out, in.string, out_len);
1275   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1276 }
1277
1278 static void
1279 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1280                          const char *format, va_list args)
1281 {
1282   const struct lex_token *token;
1283   struct string s;
1284
1285   ds_init_empty (&s);
1286
1287   token = lex_source_next__ (src, n0);
1288   if (token->token.type == T_ENDCMD)
1289     ds_put_cstr (&s, _("Syntax error at end of command"));
1290   else
1291     {
1292       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1293       if (!ss_is_empty (syntax))
1294         {
1295           char syntax_cstr[64];
1296
1297           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1298           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1299         }
1300       else
1301         ds_put_cstr (&s, _("Syntax error"));
1302     }
1303
1304   if (format)
1305     {
1306       ds_put_cstr (&s, ": ");
1307       ds_put_vformat (&s, format, args);
1308     }
1309   ds_put_byte (&s, '.');
1310
1311   struct msg m = {
1312     .category = MSG_C_SYNTAX,
1313     .severity = MSG_S_ERROR,
1314     .file_name = src->reader->file_name,
1315     .first_line = lex_source_get_first_line_number (src, n0),
1316     .last_line = lex_source_get_last_line_number (src, n1),
1317     .first_column = lex_source_get_first_column (src, n0),
1318     .last_column = lex_source_get_last_column (src, n1),
1319     .text = ds_steal_cstr (&s),
1320   };
1321   msg_emit (&m);
1322 }
1323
1324 static void PRINTF_FORMAT (2, 3)
1325 lex_get_error (struct lex_source *src, const char *format, ...)
1326 {
1327   va_list args;
1328   int n;
1329
1330   va_start (args, format);
1331
1332   n = deque_count (&src->deque) - 1;
1333   lex_source_error_valist (src, n, n, format, args);
1334   lex_source_pop_front (src);
1335
1336   va_end (args);
1337 }
1338
1339 /* Attempts to append an additional token into SRC's deque, reading more from
1340    the underlying lex_reader if necessary..  Returns true if successful, false
1341    if the deque already represents (a suffix of) the whole lex_reader's
1342    contents, */
1343 static bool
1344 lex_source_get__ (const struct lex_source *src_)
1345 {
1346   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1347   if (src->eof)
1348     return false;
1349
1350   /* State maintained while scanning tokens.  Usually we only need a single
1351      state, but scanner_push() can return SCAN_SAVE to indicate that the state
1352      needs to be saved and possibly restored later with SCAN_BACK. */
1353   struct state
1354     {
1355       struct segmenter segmenter;
1356       enum segment_type last_segment;
1357       int newlines;             /* Number of newlines encountered so far. */
1358       /* Maintained here so we can update lex_source's similar members when we
1359          finish. */
1360       size_t line_pos;
1361       size_t seg_pos;
1362     };
1363
1364   /* Initialize state. */
1365   struct state state =
1366     {
1367       .segmenter = src->segmenter,
1368       .newlines = 0,
1369       .seg_pos = src->seg_pos,
1370       .line_pos = src->line_pos,
1371     };
1372   struct state saved = state;
1373
1374   /* Append a new token to SRC and initialize it. */
1375   struct lex_token *token = lex_push_token__ (src);
1376   struct scanner scanner;
1377   scanner_init (&scanner, &token->token);
1378   token->line_pos = src->line_pos;
1379   token->token_pos = src->seg_pos;
1380   if (src->reader->line_number > 0)
1381     token->first_line = src->reader->line_number + src->n_newlines;
1382   else
1383     token->first_line = 0;
1384
1385   /* Extract segments and pass them through the scanner until we obtain a
1386      token. */
1387   for (;;)
1388     {
1389       /* Extract a segment. */
1390       const char *segment = &src->buffer[state.seg_pos - src->tail];
1391       size_t seg_maxlen = src->head - state.seg_pos;
1392       enum segment_type type;
1393       int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1394                                     src->reader->eof, &type);
1395       if (seg_len < 0)
1396         {
1397           /* The segmenter needs more input to produce a segment. */
1398           assert (!src->reader->eof);
1399           lex_source_read__ (src);
1400           continue;
1401         }
1402
1403       /* Update state based on the segment. */
1404       state.last_segment = type;
1405       state.seg_pos += seg_len;
1406       if (type == SEG_NEWLINE)
1407         {
1408           state.newlines++;
1409           state.line_pos = state.seg_pos;
1410         }
1411
1412       /* Pass the segment into the scanner and try to get a token out. */
1413       enum scan_result result = scanner_push (&scanner, type,
1414                                               ss_buffer (segment, seg_len),
1415                                               &token->token);
1416       if (result == SCAN_SAVE)
1417         saved = state;
1418       else if (result == SCAN_BACK)
1419         {
1420           state = saved;
1421           break;
1422         }
1423       else if (result == SCAN_DONE)
1424         break;
1425     }
1426
1427   /* If we've reached the end of a line, or the end of a command, then pass
1428      the line to the output engine as a syntax text item.  */
1429   int n_lines = state.newlines;
1430   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1431     {
1432       n_lines++;
1433       src->suppress_next_newline = true;
1434     }
1435   else if (n_lines > 0 && src->suppress_next_newline)
1436     {
1437       n_lines--;
1438       src->suppress_next_newline = false;
1439     }
1440   for (int i = 0; i < n_lines; i++)
1441     {
1442       /* Beginning of line. */
1443       const char *line = &src->buffer[src->journal_pos - src->tail];
1444
1445       /* Calculate line length, including \n or \r\n end-of-line if present.
1446
1447          We use src->head even though that may be beyond what we've actually
1448          converted to tokens (which is only through state.line_pos).  That's
1449          because, if we're emitting the line due to SEG_END_COMMAND, we want to
1450          take the whole line through the newline, not just through the '.'. */
1451       size_t max_len = src->head - src->journal_pos;
1452       const char *newline = memchr (line, '\n', max_len);
1453       size_t line_len = newline ? newline - line + 1 : max_len;
1454
1455       /* Calculate line length excluding end-of-line. */
1456       size_t copy_len = line_len;
1457       if (copy_len > 0 && line[copy_len - 1] == '\n')
1458         copy_len--;
1459       if (copy_len > 0 && line[copy_len - 1] == '\r')
1460         copy_len--;
1461
1462       /* Submit the line as syntax. */
1463       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1464                                                    xmemdup0 (line, copy_len),
1465                                                    NULL));
1466
1467       src->journal_pos += line_len;
1468     }
1469
1470   token->token_len = state.seg_pos - src->seg_pos;
1471
1472   src->segmenter = state.segmenter;
1473   src->seg_pos = state.seg_pos;
1474   src->line_pos = state.line_pos;
1475   src->n_newlines += state.newlines;
1476
1477   switch (token->token.type)
1478     {
1479     default:
1480       break;
1481
1482     case T_STOP:
1483       token->token.type = T_ENDCMD;
1484       src->eof = true;
1485       break;
1486
1487     case SCAN_BAD_HEX_LENGTH:
1488       lex_get_error (src, _("String of hex digits has %d characters, which "
1489                             "is not a multiple of 2"),
1490                      (int) token->token.number);
1491       break;
1492
1493     case SCAN_BAD_HEX_DIGIT:
1494     case SCAN_BAD_UNICODE_DIGIT:
1495       lex_get_error (src, _("`%c' is not a valid hex digit"),
1496                      (int) token->token.number);
1497       break;
1498
1499     case SCAN_BAD_UNICODE_LENGTH:
1500       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1501                             "not in the valid range of 1 to 8 bytes"),
1502                      (int) token->token.number);
1503       break;
1504
1505     case SCAN_BAD_UNICODE_CODE_POINT:
1506       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1507                      (int) token->token.number);
1508       break;
1509
1510     case SCAN_EXPECTED_QUOTE:
1511       lex_get_error (src, _("Unterminated string constant"));
1512       break;
1513
1514     case SCAN_EXPECTED_EXPONENT:
1515       lex_get_error (src, _("Missing exponent following `%s'"),
1516                      token->token.string.string);
1517       break;
1518
1519     case SCAN_UNEXPECTED_DOT:
1520       lex_get_error (src, _("Unexpected `.' in middle of command"));
1521       break;
1522
1523     case SCAN_UNEXPECTED_CHAR:
1524       {
1525         char c_name[16];
1526         lex_get_error (src, _("Bad character %s in input"),
1527                        uc_name (token->token.number, c_name));
1528       }
1529       break;
1530
1531     case SCAN_SKIP:
1532       lex_source_pop_front (src);
1533       break;
1534     }
1535
1536   return true;
1537 }
1538 \f
1539 static void
1540 lex_source_push_endcmd__ (struct lex_source *src)
1541 {
1542   struct lex_token *token = lex_push_token__ (src);
1543   token->token.type = T_ENDCMD;
1544   token->token_pos = 0;
1545   token->token_len = 0;
1546   token->line_pos = 0;
1547   token->first_line = 0;
1548 }
1549
1550 static struct lex_source *
1551 lex_source_create (struct lex_reader *reader)
1552 {
1553   struct lex_source *src;
1554   enum segmenter_mode mode;
1555
1556   src = xzalloc (sizeof *src);
1557   src->reader = reader;
1558
1559   if (reader->syntax == LEX_SYNTAX_AUTO)
1560     mode = SEG_MODE_AUTO;
1561   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1562     mode = SEG_MODE_INTERACTIVE;
1563   else if (reader->syntax == LEX_SYNTAX_BATCH)
1564     mode = SEG_MODE_BATCH;
1565   else
1566     NOT_REACHED ();
1567   segmenter_init (&src->segmenter, mode);
1568
1569   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1570
1571   lex_source_push_endcmd__ (src);
1572
1573   return src;
1574 }
1575
1576 static void
1577 lex_source_destroy (struct lex_source *src)
1578 {
1579   char *file_name = src->reader->file_name;
1580   char *encoding = src->reader->encoding;
1581   if (src->reader->class->destroy != NULL)
1582     src->reader->class->destroy (src->reader);
1583   free (file_name);
1584   free (encoding);
1585   free (src->buffer);
1586   while (!deque_is_empty (&src->deque))
1587     lex_source_pop__ (src);
1588   free (src->tokens);
1589   ll_remove (&src->ll);
1590   free (src);
1591 }
1592 \f
/* A lex_reader that obtains its input from a u8_istream.  Instances are
   created by lex_reader_for_file(). */
struct lex_file_reader
  {
    /* Embedded base reader.  lex_file_reader_cast() converts a pointer to
       this member back into a pointer to the enclosing structure. */
    struct lex_reader reader;

    /* Input stream, read by lex_file_read() and released by
       lex_file_close(). */
    struct u8_istream *istream;
  };

/* Declared here so that lex_reader_for_file() can refer to the class; the
   initializer appears after the functions that it names. */
static struct lex_reader_class lex_file_reader_class;
1600
1601 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1602    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1603    ENCODING, which should take one of the forms accepted by
1604    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1605    mode of the new reader, respectively.
1606
1607    Returns a null pointer if FILE_NAME cannot be opened. */
1608 struct lex_reader *
1609 lex_reader_for_file (const char *file_name, const char *encoding,
1610                      enum lex_syntax_mode syntax,
1611                      enum lex_error_mode error)
1612 {
1613   struct lex_file_reader *r;
1614   struct u8_istream *istream;
1615
1616   istream = (!strcmp(file_name, "-")
1617              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1618              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1619   if (istream == NULL)
1620     {
1621       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1622       return NULL;
1623     }
1624
1625   r = xmalloc (sizeof *r);
1626   lex_reader_init (&r->reader, &lex_file_reader_class);
1627   r->reader.syntax = syntax;
1628   r->reader.error = error;
1629   r->reader.file_name = xstrdup (file_name);
1630   r->reader.encoding = xstrdup_if_nonnull (encoding);
1631   r->reader.line_number = 1;
1632   r->istream = istream;
1633
1634   return &r->reader;
1635 }
1636
1637 static struct lex_file_reader *
1638 lex_file_reader_cast (struct lex_reader *r)
1639 {
1640   return UP_CAST (r, struct lex_file_reader, reader);
1641 }
1642
1643 static size_t
1644 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1645                enum prompt_style prompt_style UNUSED)
1646 {
1647   struct lex_file_reader *r = lex_file_reader_cast (r_);
1648   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1649   if (n_read < 0)
1650     {
1651       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1652       return 0;
1653     }
1654   return n_read;
1655 }
1656
1657 static void
1658 lex_file_close (struct lex_reader *r_)
1659 {
1660   struct lex_file_reader *r = lex_file_reader_cast (r_);
1661
1662   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1663     {
1664       if (u8_istream_close (r->istream) != 0)
1665         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1666     }
1667   else
1668     u8_istream_free (r->istream);
1669
1670   free (r);
1671 }
1672
1673 static struct lex_reader_class lex_file_reader_class =
1674   {
1675     lex_file_read,
1676     lex_file_close
1677   };
1678 \f
/* A lex_reader that obtains its input from a string held in memory.
   Instances are created by lex_reader_for_substring_nocopy(). */
struct lex_string_reader
  {
    /* Embedded base reader.  lex_string_reader_cast() converts a pointer to
       this member back into a pointer to the enclosing structure. */
    struct lex_reader reader;

    /* The text to read.  lex_string_close() releases it with
       ss_dealloc(). */
    struct substring s;

    /* Number of bytes of S that lex_string_read() has handed out so far. */
    size_t offset;
  };

/* Declared here so that lex_reader_for_substring_nocopy() can refer to the
   class; the initializer appears after the functions that it names. */
static struct lex_reader_class lex_string_reader_class;
1687
1688 /* Creates and returns a new lex_reader for the contents of S, which must be
1689    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
1690    with ss_dealloc() when it is closed. */
1691 struct lex_reader *
1692 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1693 {
1694   struct lex_string_reader *r;
1695
1696   r = xmalloc (sizeof *r);
1697   lex_reader_init (&r->reader, &lex_string_reader_class);
1698   r->reader.syntax = LEX_SYNTAX_AUTO;
1699   r->reader.encoding = xstrdup_if_nonnull (encoding);
1700   r->s = s;
1701   r->offset = 0;
1702
1703   return &r->reader;
1704 }
1705
1706 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1707    which must be encoded in ENCODING.  The caller retains ownership of S. */
1708 struct lex_reader *
1709 lex_reader_for_string (const char *s, const char *encoding)
1710 {
1711   struct substring ss;
1712   ss_alloc_substring (&ss, ss_cstr (s));
1713   return lex_reader_for_substring_nocopy (ss, encoding);
1714 }
1715
/* Formats FORMAT as a printf()-like format string and creates and returns a
   new lex_reader, with the given ENCODING, for the formatted result.

   The arguments that FORMAT consumes come after ENCODING, not directly after
   FORMAT. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The reader takes over the string that xvasprintf() returned. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1730
1731 static struct lex_string_reader *
1732 lex_string_reader_cast (struct lex_reader *r)
1733 {
1734   return UP_CAST (r, struct lex_string_reader, reader);
1735 }
1736
1737 static size_t
1738 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1739                  enum prompt_style prompt_style UNUSED)
1740 {
1741   struct lex_string_reader *r = lex_string_reader_cast (r_);
1742   size_t chunk;
1743
1744   chunk = MIN (n, r->s.length - r->offset);
1745   memcpy (buf, r->s.string + r->offset, chunk);
1746   r->offset += chunk;
1747
1748   return chunk;
1749 }
1750
1751 static void
1752 lex_string_close (struct lex_reader *r_)
1753 {
1754   struct lex_string_reader *r = lex_string_reader_cast (r_);
1755
1756   ss_dealloc (&r->s);
1757   free (r);
1758 }
1759
1760 static struct lex_reader_class lex_string_reader_class =
1761   {
1762     lex_string_read,
1763     lex_string_close
1764   };