1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/macro.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
82 struct segmenter segmenter;
83 bool eof; /* True if T_STOP was read from 'reader'. */
85 /* Buffer of UTF-8 bytes. */
87 size_t allocated; /* Number of bytes allocated. */
88 size_t tail; /* &buffer[0] offset into UTF-8 source. */
89 size_t head; /* &buffer[head - tail] offset into source. */
91 /* Positions in source file, tail <= pos <= head for each member here. */
92 size_t journal_pos; /* First byte not yet output to journal. */
93 size_t seg_pos; /* First byte not yet scanned as token. */
94 size_t line_pos; /* First byte of line containing seg_pos. */
96 int n_newlines; /* Number of new-lines up to seg_pos. */
97 bool suppress_next_newline;
100 struct deque deque; /* Indexes into 'tokens'. */
101 struct lex_token *tokens; /* Lookahead tokens for parser. */
104 static struct lex_source *lex_source_create (struct lex_reader *);
105 static void lex_source_destroy (struct lex_source *);
110 struct ll_list sources; /* Contains "struct lex_source"s. */
113 static struct lex_source *lex_source__ (const struct lexer *);
114 static const struct lex_token *lex_next__ (const struct lexer *, int n);
115 static void lex_source_push_endcmd__ (struct lex_source *);
117 static void lex_source_pop__ (struct lex_source *);
118 static bool lex_source_get (const struct lex_source *);
119 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
120 const char *format, va_list)
121 PRINTF_FORMAT (4, 0);
122 static const struct lex_token *lex_source_next__ (const struct lex_source *,
125 /* Initializes READER with the specified CLASS and otherwise some reasonable
126 defaults. The caller should fill in the others members as desired. */
128 lex_reader_init (struct lex_reader *reader,
129 const struct lex_reader_class *class)
131 reader->class = class;
132 reader->syntax = SEG_MODE_AUTO;
133 reader->error = LEX_ERROR_CONTINUE;
134 reader->file_name = NULL;
135 reader->encoding = NULL;
136 reader->line_number = 0;
140 /* Frees any file name already in READER and replaces it by a copy of
141 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
143 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
145 free (reader->file_name);
146 reader->file_name = xstrdup_if_nonnull (file_name);
149 /* Creates and returns a new lexer. */
153 struct lexer *lexer = xzalloc (sizeof *lexer);
154 ll_init (&lexer->sources);
158 /* Destroys LEXER. */
160 lex_destroy (struct lexer *lexer)
164 struct lex_source *source, *next;
166 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
167 lex_source_destroy (source);
172 /* Inserts READER into LEXER so that the next token read by LEXER comes from
173 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
176 lex_include (struct lexer *lexer, struct lex_reader *reader)
178 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
179 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
182 /* Appends READER to LEXER, so that it will be read after all other current
183 readers have already been read. */
185 lex_append (struct lexer *lexer, struct lex_reader *reader)
187 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
192 static struct lex_token *
193 lex_push_token__ (struct lex_source *src)
195 struct lex_token *token;
197 if (deque_is_full (&src->deque))
198 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
200 token = &src->tokens[deque_push_front (&src->deque)];
201 token->token = (struct token) { .type = T_STOP };
206 lex_source_pop__ (struct lex_source *src)
208 token_uninit (&src->tokens[deque_pop_back (&src->deque)].token);
212 lex_source_pop_front (struct lex_source *src)
214 token_uninit (&src->tokens[deque_pop_front (&src->deque)].token);
217 /* Advances LEXER to the next token, consuming the current token. */
219 lex_get (struct lexer *lexer)
221 struct lex_source *src;
223 src = lex_source__ (lexer);
227 if (!deque_is_empty (&src->deque))
228 lex_source_pop__ (src);
230 while (deque_is_empty (&src->deque))
231 if (!lex_source_get (src))
233 lex_source_destroy (src);
234 src = lex_source__ (lexer);
/* Issuing errors. */

/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, 0, 0, format, args);
  va_end (args);
}

/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}

/* Prints a syntax error message containing the tokens N0 through N1 after the
   current one and given message MESSAGE (if non-null). */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, n0, n1, format, args);
  va_end (args);
}

/* Prints a syntax error message saying that one of the strings provided as
   varargs, up to the first NULL, is expected. */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list args;

  va_start (args, lexer);
  lex_error_expecting_valist (lexer, args);
  va_end (args);
}
/* Prints a syntax error message saying that one of the options provided in
   ARGS, up to the first NULL, is expected. */
void
lex_error_expecting_valist (struct lexer *lexer, va_list args)
{
  enum { MAX_OPTIONS = 9 };
  const char *options[MAX_OPTIONS];
  size_t n = 0;

  /* Collect up to MAX_OPTIONS strings, stopping at the NULL terminator. */
  while (n < MAX_OPTIONS)
    {
      const char *option = va_arg (args, const char *);
      if (!option)
        break;

      options[n++] = option;
    }
  lex_error_expecting_array (lexer, options, n);
}
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.  With more than 8 options (or none) the generic message is
   used because listing them all would be unreadable. */
void
lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
{
  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      lex_error (lexer, NULL);
    }
}
360 /* Reports an error to the effect that subcommand SBC may only be specified
363 This function does not take a lexer as an argument or use lex_error(),
364 because the result would ordinarily just be redundant: "Syntax error at
365 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
366 not help the user find the error. */
368 lex_sbc_only_once (const char *sbc)
370 msg (SE, _("Subcommand %s may only be specified once."), sbc);
373 /* Reports an error to the effect that subcommand SBC is missing.
375 This function does not take a lexer as an argument or use lex_error(),
376 because a missing subcommand can normally be detected only after the whole
377 command has been parsed, and so lex_error() would always report "Syntax
378 error at end of command", which does not help the user find the error. */
380 lex_sbc_missing (const char *sbc)
382 msg (SE, _("Required subcommand %s was not specified."), sbc);
385 /* Reports an error to the effect that specification SPEC may only be specified
386 once within subcommand SBC. */
388 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
390 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
394 /* Reports an error to the effect that specification SPEC is missing within
397 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
399 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
403 /* Prints a syntax error message containing the current token and
404 given message MESSAGE (if non-null). */
406 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
407 const char *format, va_list args)
409 struct lex_source *src = lex_source__ (lexer);
412 lex_source_error_valist (src, n0, n1, format, args);
418 ds_put_format (&s, _("Syntax error at end of input"));
421 ds_put_cstr (&s, ": ");
422 ds_put_vformat (&s, format, args);
424 ds_put_byte (&s, '.');
425 msg (SE, "%s", ds_cstr (&s));
430 /* Checks that we're at end of command.
431 If so, returns a successful command completion code.
432 If not, flags a syntax error and returns an error command
435 lex_end_of_command (struct lexer *lexer)
437 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
439 lex_error (lexer, _("expecting end of command"));
/* Token testing functions. */

/* Returns true if the current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}

/* Returns true if the current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}

/* Returns the value of the current token, which must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}

/* Returns true iff the current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}

/* Returns the value of the current token, which must be an
   integer. */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
485 /* Token testing functions with lookahead.
487 A value of 0 for N as an argument to any of these functions refers to the
488 current token. Lookahead is limited to the current command. Any N greater
489 than the number of tokens remaining in the current command will be treated
490 as referring to a T_ENDCMD token. */
492 /* Returns true if the token N ahead of the current token is a number. */
494 lex_next_is_number (const struct lexer *lexer, int n)
496 enum token_type next_token = lex_next_token (lexer, n);
497 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
500 /* Returns true if the token N ahead of the current token is a string. */
502 lex_next_is_string (const struct lexer *lexer, int n)
504 return lex_next_token (lexer, n) == T_STRING;
507 /* Returns the value of the token N ahead of the current token, which must be a
508 floating point number. */
510 lex_next_number (const struct lexer *lexer, int n)
512 assert (lex_next_is_number (lexer, n));
513 return lex_next_tokval (lexer, n);
516 /* Returns true if the token N ahead of the current token is an integer. */
518 lex_next_is_integer (const struct lexer *lexer, int n)
522 if (!lex_next_is_number (lexer, n))
525 value = lex_next_tokval (lexer, n);
526 return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
529 /* Returns the value of the token N ahead of the current token, which must be
532 lex_next_integer (const struct lexer *lexer, int n)
534 assert (lex_next_is_integer (lexer, n));
535 return lex_next_tokval (lexer, n);
538 /* Token matching functions. */
540 /* If the current token has the specified TYPE, skips it and returns true.
541 Otherwise, returns false. */
543 lex_match (struct lexer *lexer, enum token_type type)
545 if (lex_token (lexer) == type)
554 /* If the current token matches IDENTIFIER, skips it and returns true.
555 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
558 IDENTIFIER must be an ASCII string. */
560 lex_match_id (struct lexer *lexer, const char *identifier)
562 return lex_match_id_n (lexer, identifier, 3);
565 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
566 may be abbreviated to its first N letters. Otherwise, returns false.
568 IDENTIFIER must be an ASCII string. */
570 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
572 if (lex_token (lexer) == T_ID
573 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
582 /* If the current token is integer X, skips it and returns true. Otherwise,
585 lex_match_int (struct lexer *lexer, int x)
587 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
596 /* Forced matches. */
598 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
599 abbreviated to its first 3 letters. Otherwise, reports an error and returns
602 IDENTIFIER must be an ASCII string. */
604 lex_force_match_id (struct lexer *lexer, const char *identifier)
606 if (lex_match_id (lexer, identifier))
610 lex_error_expecting (lexer, identifier);
615 /* If the current token has the specified TYPE, skips it and returns true.
616 Otherwise, reports an error and returns false. */
618 lex_force_match (struct lexer *lexer, enum token_type type)
620 if (lex_token (lexer) == type)
627 const char *type_string = token_type_to_string (type);
630 char *s = xasprintf ("`%s'", type_string);
631 lex_error_expecting (lexer, s);
635 lex_error_expecting (lexer, token_type_to_name (type));
641 /* If the current token is a string, does nothing and returns true.
642 Otherwise, reports an error and returns false. */
644 lex_force_string (struct lexer *lexer)
646 if (lex_is_string (lexer))
650 lex_error (lexer, _("expecting string"));
655 /* If the current token is a string or an identifier, does nothing and returns
656 true. Otherwise, reports an error and returns false.
658 This is meant for use in syntactic situations where we want to encourage the
659 user to supply a quoted string, but for compatibility we also accept
660 identifiers. (One example of such a situation is file names.) Therefore,
661 the error message issued when the current token is wrong only says that a
662 string is expected and doesn't mention that an identifier would also be
665 lex_force_string_or_id (struct lexer *lexer)
667 return lex_token (lexer) == T_ID || lex_force_string (lexer);
670 /* If the current token is an integer, does nothing and returns true.
671 Otherwise, reports an error and returns false. */
673 lex_force_int (struct lexer *lexer)
675 if (lex_is_integer (lexer))
679 lex_error (lexer, _("expecting integer"));
/* If the current token is an integer in the range MIN...MAX (inclusive), does
   nothing and returns true.  Otherwise, reports an error and returns false.
   If NAME is nonnull, then it is used in the error message. */
bool
lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
{
  bool is_integer = lex_is_integer (lexer);
  bool too_small = is_integer && lex_integer (lexer) < min;
  bool too_big = is_integer && lex_integer (lexer) > max;
  if (is_integer && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* Weird, maybe a bug in the caller.  Just report that we needed an
         integer. */
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
    }
  else if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld for %s."), min, name);
      else
        lex_error (lexer, _("Expected %ld."), min);
    }
  else if (min + 1 == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
      else
        lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
    }
  else
    {
      /* Only mention a bound if it is "interesting": either it was actually
         violated, or it is close enough to zero to be a real restriction. */
      bool report_lower_bound = (min > INT_MIN / 2) || too_small;
      bool report_upper_bound = (max < INT_MAX / 2) || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer between %ld and %ld for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Expected integer between %ld and %ld."),
                       min, max);
        }
      else if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer, _("Expected non-negative integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected non-negative integer."));
            }
          else if (min == 1)
            {
              if (name)
                lex_error (lexer, _("Expected positive integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected positive integer."));
            }
        }
      else if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer less than or equal to %ld for %s."),
                       max, name);
          else
            lex_error (lexer, _("Expected integer less than or equal to %ld."),
                       max);
        }
      else
        {
          if (name)
            lex_error (lexer, _("Integer expected for %s."), name);
          else
            lex_error (lexer, _("Integer expected."));
        }
    }
  return false;
}
774 /* If the current token is a number, does nothing and returns true.
775 Otherwise, reports an error and returns false. */
777 lex_force_num (struct lexer *lexer)
779 if (lex_is_number (lexer))
782 lex_error (lexer, _("expecting number"));
786 /* If the current token is an identifier, does nothing and returns true.
787 Otherwise, reports an error and returns false. */
789 lex_force_id (struct lexer *lexer)
791 if (lex_token (lexer) == T_ID)
794 lex_error (lexer, _("expecting identifier"));
798 /* Token accessors. */
800 /* Returns the type of LEXER's current token. */
802 lex_token (const struct lexer *lexer)
804 return lex_next_token (lexer, 0);
807 /* Returns the number in LEXER's current token.
809 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
810 tokens this function will always return zero. */
812 lex_tokval (const struct lexer *lexer)
814 return lex_next_tokval (lexer, 0);
817 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
819 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
820 this functions this function will always return NULL.
822 The UTF-8 encoding of the returned string is correct for variable names and
823 other identifiers. Use filename_to_utf8() to use it as a filename. Use
824 data_in() to use it in a "union value". */
826 lex_tokcstr (const struct lexer *lexer)
828 return lex_next_tokcstr (lexer, 0);
831 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
832 null-terminated (but the null terminator is not included in the returned
833 substring's 'length').
835 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
836 this functions this function will always return NULL.
838 The UTF-8 encoding of the returned string is correct for variable names and
839 other identifiers. Use filename_to_utf8() to use it as a filename. Use
840 data_in() to use it in a "union value". */
842 lex_tokss (const struct lexer *lexer)
844 return lex_next_tokss (lexer, 0);
849 A value of 0 for N as an argument to any of these functions refers to the
850 current token. Lookahead is limited to the current command. Any N greater
851 than the number of tokens remaining in the current command will be treated
852 as referring to a T_ENDCMD token. */
854 static const struct lex_token *
855 lex_next__ (const struct lexer *lexer_, int n)
857 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
858 struct lex_source *src = lex_source__ (lexer);
861 return lex_source_next__ (src, n);
864 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
869 static const struct token *
870 lex_source_front (const struct lex_source *src)
872 return &src->tokens[deque_front (&src->deque, 0)].token;
875 static const struct lex_token *
876 lex_source_next__ (const struct lex_source *src, int n)
878 while (deque_count (&src->deque) <= n)
880 if (!deque_is_empty (&src->deque))
882 const struct token *front = lex_source_front (src);
883 if (front->type == T_STOP || front->type == T_ENDCMD)
887 lex_source_get (src);
890 return &src->tokens[deque_back (&src->deque, n)];
893 /* Returns the "struct token" of the token N after the current one in LEXER.
894 The returned pointer can be invalidated by pretty much any succeeding call
895 into the lexer, although the string pointer within the returned token is
896 only invalidated by consuming the token (e.g. with lex_get()). */
898 lex_next (const struct lexer *lexer, int n)
900 return &lex_next__ (lexer, n)->token;
903 /* Returns the type of the token N after the current one in LEXER. */
905 lex_next_token (const struct lexer *lexer, int n)
907 return lex_next (lexer, n)->type;
910 /* Returns the number in the tokn N after the current one in LEXER.
912 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
913 tokens this function will always return zero. */
915 lex_next_tokval (const struct lexer *lexer, int n)
917 const struct token *token = lex_next (lexer, n);
918 return token->number;
921 /* Returns the null-terminated string in the token N after the current one, in
924 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
925 this functions this function will always return NULL.
927 The UTF-8 encoding of the returned string is correct for variable names and
928 other identifiers. Use filename_to_utf8() to use it as a filename. Use
929 data_in() to use it in a "union value". */
931 lex_next_tokcstr (const struct lexer *lexer, int n)
933 return lex_next_tokss (lexer, n).string;
936 /* Returns the string in the token N after the current one, in UTF-8 encoding.
937 The string is null-terminated (but the null terminator is not included in
938 the returned substring's 'length').
940 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
941 tokens this functions this function will always return NULL.
943 The UTF-8 encoding of the returned string is correct for variable names and
944 other identifiers. Use filename_to_utf8() to use it as a filename. Use
945 data_in() to use it in a "union value". */
947 lex_next_tokss (const struct lexer *lexer, int n)
949 return lex_next (lexer, n)->string;
953 lex_tokens_match (const struct token *actual, const struct token *expected)
955 if (actual->type != expected->type)
958 switch (actual->type)
962 return actual->number == expected->number;
965 return lex_id_match (expected->string, actual->string);
968 return (actual->string.length == expected->string.length
969 && !memcmp (actual->string.string, expected->string.string,
970 actual->string.length));
977 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
978 skips it and returns true. Otherwise, returns false.
980 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
981 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
982 first three letters. */
984 lex_match_phrase (struct lexer *lexer, const char *s)
986 struct string_lexer slex;
991 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
992 while (string_lexer_next (&slex, &token))
993 if (token.type != SCAN_SKIP)
995 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
996 token_uninit (&token);
1007 lex_source_get_first_line_number (const struct lex_source *src, int n)
1009 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line characters in the LENGTH bytes starting
   at S. */
static int
count_newlines (char *s, size_t length)
{
  int n_newlines = 0;
  char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
1029 lex_source_get_last_line_number (const struct lex_source *src, int n)
1031 const struct lex_token *token = lex_source_next__ (src, n);
1033 if (token->first_line == 0)
1037 char *token_str = &src->buffer[token->token_pos - src->tail];
1038 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1043 count_columns (const char *s_, size_t length)
1045 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1051 for (ofs = 0; ofs < length; ofs += mblen)
1055 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1058 int width = uc_width (uc, "UTF-8");
1063 columns = ROUND_UP (columns + 1, 8);
1070 lex_source_get_first_column (const struct lex_source *src, int n)
1072 const struct lex_token *token = lex_source_next__ (src, n);
1073 return count_columns (&src->buffer[token->line_pos - src->tail],
1074 token->token_pos - token->line_pos);
1078 lex_source_get_last_column (const struct lex_source *src, int n)
1080 const struct lex_token *token = lex_source_next__ (src, n);
1081 char *start, *end, *newline;
1083 start = &src->buffer[token->line_pos - src->tail];
1084 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1085 newline = memrchr (start, '\n', end - start);
1086 if (newline != NULL)
1087 start = newline + 1;
1088 return count_columns (start, end - start);
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
   if the token is drawn from a source that does not have line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
}

/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
}

/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_first_column (src, n) : 0;
}

/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_last_column (src, n) : 0;
}
1146 /* Returns the name of the syntax file from which the current command is drawn.
1147 Returns NULL for a T_STOP token or if the command's source does not have
1150 There is no version of this function that takes an N argument because
1151 lookahead only works to the end of a command and any given command is always
1152 within a single syntax file. */
1154 lex_get_file_name (const struct lexer *lexer)
1156 struct lex_source *src = lex_source__ (lexer);
1157 return src == NULL ? NULL : src->reader->file_name;
1161 lex_get_encoding (const struct lexer *lexer)
1163 struct lex_source *src = lex_source__ (lexer);
1164 return src == NULL ? NULL : src->reader->encoding;
1168 /* Returns the syntax mode for the syntax file from which the current drawn is
1169 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1170 does not have line numbers.
1172 There is no version of this function that takes an N argument because
1173 lookahead only works to the end of a command and any given command is always
1174 within a single syntax file. */
1176 lex_get_syntax_mode (const struct lexer *lexer)
1178 struct lex_source *src = lex_source__ (lexer);
1179 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1182 /* Returns the error mode for the syntax file from which the current drawn is
1183 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1184 source does not have line numbers.
1186 There is no version of this function that takes an N argument because
1187 lookahead only works to the end of a command and any given command is always
1188 within a single syntax file. */
1190 lex_get_error_mode (const struct lexer *lexer)
1192 struct lex_source *src = lex_source__ (lexer);
1193 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1196 /* If the source that LEXER is currently reading has error mode
1197 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1198 token to be read comes directly from whatever is next read from the stream.
1200 It makes sense to call this function after encountering an error in a
1201 command entered on the console, because usually the user would prefer not to
1202 have cascading errors. */
1204 lex_interactive_reset (struct lexer *lexer)
1206 struct lex_source *src = lex_source__ (lexer);
1207 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1209 src->head = src->tail = 0;
1210 src->journal_pos = src->seg_pos = src->line_pos = 0;
1211 src->n_newlines = 0;
1212 src->suppress_next_newline = false;
1213 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1214 while (!deque_is_empty (&src->deque))
1215 lex_source_pop__ (src);
1216 lex_source_push_endcmd__ (src);
1220 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1222 lex_discard_rest_of_command (struct lexer *lexer)
1224 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1228 /* Discards all lookahead tokens in LEXER, then discards all input sources
1229 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1230 runs out of input sources. */
1232 lex_discard_noninteractive (struct lexer *lexer)
1234 struct lex_source *src = lex_source__ (lexer);
1238 while (!deque_is_empty (&src->deque))
1239 lex_source_pop__ (src);
1241 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1242 src = lex_source__ (lexer))
1243 lex_source_destroy (src);
/* Computes the farthest position that SRC's buffer tail may advance to
   without discarding bytes still needed: the journal position, the current
   line, and the line containing the oldest buffered token all constrain it.
   NOTE(review): the return-type line, local "max_tail" declaration, braces,
   and the final return statement are missing from this extraction. */
1248 lex_source_max_tail__ (const struct lex_source *src)
1250 const struct lex_token *token;
/* Both the journal and the current line must stay inside the buffer. */
1253 assert (src->seg_pos >= src->line_pos);
1254 max_tail = MIN (src->journal_pos, src->line_pos);
1256 /* Use the oldest token also. (We know that src->deque cannot be empty
1257 because we are in the process of adding a new token, which is already
1258 initialized enough to use here.) */
1259 token = &src->tokens[deque_back (&src->deque, 0)];
1260 assert (token->token_pos >= token->line_pos);
1261 max_tail = MIN (max_tail, token->line_pos);
/* Ensures SRC's buffer has free space at its head: first by sliding
   still-needed bytes down to the start of the buffer (advancing the tail),
   and otherwise by growing the allocation with x2realloc(). */
1267 lex_source_expand__ (struct lex_source *src)
/* Only act when the in-use span fills the whole allocation. */
1269 if (src->head - src->tail >= src->allocated)
1271 size_t max_tail = lex_source_max_tail__ (src);
1272 if (max_tail > src->tail)
1274 /* Advance the tail, freeing up room at the head. */
1275 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1276 src->head - max_tail);
1277 src->tail = max_tail;
1281 /* Buffer is completely full. Expand it. */
1282 src->buffer = x2realloc (src->buffer, &src->allocated);
1287 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's underlying lex_reader until the not-yet-
   segmented part of the buffer contains a newline (or end of input).
   NOTE(review): the do { ... } framing, the short-read/EOF test, and the
   early-return on EOF are missing from this extraction. */
1292 lex_source_read__ (struct lex_source *src)
/* Make room before each read attempt. */
1296 lex_source_expand__ (src);
1298 size_t head_ofs = src->head - src->tail;
1299 size_t space = src->allocated - head_ofs;
/* The reader may use the segmenter's prompt style for interactive input. */
1300 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1301 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1303 assert (n <= space);
/* A short read marks end of input for this reader. */
1308 src->reader->eof = true;
1309 lex_source_expand__ (src);
/* Loop until a newline appears in the unsegmented region. */
1315 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1316 src->head - src->seg_pos));
/* Returns the source that LEXER is currently reading from (the head of its
   source list), or NULL if LEXER has no sources. */
1319 static struct lex_source *
1320 lex_source__ (const struct lexer *lexer)
1322 return (ll_is_empty (&lexer->sources) ? NULL
1323 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the raw syntax text in SRC spanning lookahead tokens N0 through
   MAX(N0, N1), as a substring that aliases SRC's buffer (not a copy). */
1326 static struct substring
1327 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1329 const struct lex_token *token0 = lex_source_next__ (src, n0);
1330 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1331 size_t start = token0->token_pos;
1332 size_t end = token1->token_pos + token1->token_len;
/* token_pos is an absolute buffer offset; subtract the tail to index the
   in-memory buffer. */
1334 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies IN into OUT (which has OUT_SIZE bytes), stopping at the first
   new-line, null byte, or CR-LF pair, truncating on a UTF-8 character
   boundary, and appending "..." when IN was not copied in full.
   NOTE(review): local declarations (out_maxlen, out_len, mblen), braces, and
   some break statements are missing from this extraction. */
1338 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1344 assert (out_size >= 16);
1345 out_maxlen = out_size - 1;
/* Reserve room for the "..." suffix when IN cannot fit whole. */
1346 if (in.length > out_maxlen - 3)
1349 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Stop at end of line or at a null byte. */
1351 if (in.string[out_len] == '\n'
1352 || in.string[out_len] == '\0'
1353 || (in.string[out_len] == '\r'
1354 && out_len + 1 < in.length
1355 && in.string[out_len + 1] == '\n'))
/* Measure the current UTF-8 character so truncation never splits one. */
1358 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1359 in.length - out_len);
1364 if (out_len + mblen > out_maxlen)
1368 memcpy (out, in.string, out_len);
/* Append "..." only if we stopped before consuming all of IN. */
1369 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Builds a syntax error message for lookahead tokens N0 through N1 in SRC,
   appends FORMAT expanded with ARGS, and emits the result as a syntax-
   category error annotated with the tokens' file/line/column range.
   NOTE(review): the ds_init/struct msg framing lines and the final emit call
   are missing from this extraction. */
1373 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1374 const char *format, va_list args)
1376 const struct lex_token *token;
1381 token = lex_source_next__ (src, n0);
1382 if (token->token.type == T_ENDCMD)
1383 ds_put_cstr (&s, _("Syntax error at end of command"));
/* Otherwise quote an ellipsized copy of the offending syntax, if any. */
1386 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1387 if (!ss_is_empty (syntax))
1389 char syntax_cstr[64];
1391 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1392 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1395 ds_put_cstr (&s, _("Syntax error"));
1400 ds_put_cstr (&s, ": ");
1401 ds_put_vformat (&s, format, args);
/* Ensure the message ends with a period. */
1403 if (ds_last (&s) != '.')
1404 ds_put_byte (&s, '.');
/* Designated initializers for the message that gets emitted; the text
   buffer's ownership transfers via ds_steal_cstr(). */
1407 .category = MSG_C_SYNTAX,
1408 .severity = MSG_S_ERROR,
1409 .file_name = src->reader->file_name,
1410 .first_line = lex_source_get_first_line_number (src, n0),
1411 .last_line = lex_source_get_last_line_number (src, n1),
1412 .first_column = lex_source_get_first_column (src, n0),
1413 .last_column = lex_source_get_last_column (src, n1),
1414 .text = ds_steal_cstr (&s),
/* Reports a syntax error, formatted printf()-style, against the newest token
   in SRC's deque, then pops that token so scanning can continue. */
1419 static void PRINTF_FORMAT (2, 3)
1420 lex_get_error (struct lex_source *src, const char *format, ...)
1425 va_start (args, format);
/* The newest token is the one farthest back in the lookahead deque. */
1427 n = deque_count (&src->deque) - 1;
1428 lex_source_error_valist (src, n, n, format, args);
1429 lex_source_pop_front (src);
/* NOTE(review): the matching va_end() call is not visible in this
   extraction — confirm against the full file. */
1434 /* Attempts to append an additional token into SRC's deque, reading more from
1435 the underlying lex_reader if necessary. Returns true if a new token was
1436 added to SRC's deque, false otherwise. */
1438 lex_source_try_get (struct lex_source *src)
1440 /* State maintained while scanning tokens. Usually we only need a single
1441 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1442 needs to be saved and possibly restored later with SCAN_BACK. */
1445 struct segmenter segmenter;
1446 enum segment_type last_segment;
1447 int newlines; /* Number of newlines encountered so far. */
1448 /* Maintained here so we can update lex_source's similar members when we
1454 /* Initialize state. */
1455 struct state state =
1457 .segmenter = src->segmenter,
1459 .seg_pos = src->seg_pos,
1460 .line_pos = src->line_pos,
/* SAVED holds the SCAN_SAVE restore point for the scanner. */
1462 struct state saved = state;
1464 /* Append a new token to SRC and initialize it. */
1465 struct lex_token *token = lex_push_token__ (src);
1466 struct scanner scanner;
1467 scanner_init (&scanner, &token->token);
1468 token->line_pos = src->line_pos;
1469 token->token_pos = src->seg_pos;
/* Track line numbers only for readers that report them (line_number > 0). */
1470 if (src->reader->line_number > 0)
1471 token->first_line = src->reader->line_number + src->n_newlines;
1473 token->first_line = 0;
1475 /* Extract segments and pass them through the scanner until we obtain a
1479 /* Extract a segment. */
1480 const char *segment = &src->buffer[state.seg_pos - src->tail];
1481 size_t seg_maxlen = src->head - state.seg_pos;
1482 enum segment_type type;
1483 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1484 src->reader->eof, &type);
1487 /* The segmenter needs more input to produce a segment. */
1488 assert (!src->reader->eof);
1489 lex_source_read__ (src);
1493 /* Update state based on the segment. */
1494 state.last_segment = type;
1495 state.seg_pos += seg_len;
1496 if (type == SEG_NEWLINE)
1499 state.line_pos = state.seg_pos;
1502 /* Pass the segment into the scanner and try to get a token out. */
1503 enum scan_result result = scanner_push (&scanner, type,
1504 ss_buffer (segment, seg_len),
1506 if (result == SCAN_SAVE)
1508 else if (result == SCAN_BACK)
1513 else if (result == SCAN_DONE)
1517 /* If we've reached the end of a line, or the end of a command, then pass
1518 the line to the output engine as a syntax text item. */
1519 int n_lines = state.newlines;
1520 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1523 src->suppress_next_newline = true;
1525 else if (n_lines > 0 && src->suppress_next_newline)
1528 src->suppress_next_newline = false;
1530 for (int i = 0; i < n_lines; i++)
1532 /* Beginning of line. */
1533 const char *line = &src->buffer[src->journal_pos - src->tail];
1535 /* Calculate line length, including \n or \r\n end-of-line if present.
1537 We use src->head even though that may be beyond what we've actually
1538 converted to tokens (which is only through state.line_pos). That's
1539 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1540 take the whole line through the newline, not just through the '.'. */
1541 size_t max_len = src->head - src->journal_pos;
1542 const char *newline = memchr (line, '\n', max_len);
1543 size_t line_len = newline ? newline - line + 1 : max_len;
1545 /* Calculate line length excluding end-of-line. */
1546 size_t copy_len = line_len;
1547 if (copy_len > 0 && line[copy_len - 1] == '\n')
1549 if (copy_len > 0 && line[copy_len - 1] == '\r')
1552 /* Submit the line as syntax. */
1553 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1554 xmemdup0 (line, copy_len),
1557 src->journal_pos += line_len;
/* A complete token was scanned: record its length and fold the local
   scanning state back into SRC. */
1560 token->token_len = state.seg_pos - src->seg_pos;
1562 src->segmenter = state.segmenter;
1563 src->seg_pos = state.seg_pos;
1564 src->line_pos = state.line_pos;
1565 src->n_newlines += state.newlines;
/* Post-process the scanned token: translate some token types and report
   scan-level errors (each error pops the bad token via lex_get_error()). */
1567 switch (token->token.type)
1573 token->token.type = T_ENDCMD;
1577 case SCAN_BAD_HEX_LENGTH:
1578 lex_get_error (src, _("String of hex digits has %d characters, which "
1579 "is not a multiple of 2"),
1580 (int) token->token.number);
1583 case SCAN_BAD_HEX_DIGIT:
1584 case SCAN_BAD_UNICODE_DIGIT:
1585 lex_get_error (src, _("`%c' is not a valid hex digit"),
1586 (int) token->token.number);
1589 case SCAN_BAD_UNICODE_LENGTH:
1590 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1591 "not in the valid range of 1 to 8 bytes"),
1592 (int) token->token.number);
1595 case SCAN_BAD_UNICODE_CODE_POINT:
1596 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1597 (int) token->token.number);
1600 case SCAN_EXPECTED_QUOTE:
1601 lex_get_error (src, _("Unterminated string constant"));
1604 case SCAN_EXPECTED_EXPONENT:
1605 lex_get_error (src, _("Missing exponent following `%s'"),
1606 token->token.string.string);
1609 case SCAN_UNEXPECTED_CHAR:
/* Presumably a char c_name[...] buffer declaration precedes this call in
   the full file — it is missing from this extraction. */
1612 lex_get_error (src, _("Bad character %s in input"),
1613 uc_name (token->token.number, c_name));
1618 lex_source_pop_front (src);
/* NOTE(review): only a fragment of this function survives extraction; it
   appears to retry lex_source_try_get() until a token is produced — confirm
   against the full file. */
1626 lex_source_get__ (struct lex_source *src)
1631 else if (lex_source_try_get (src))
/* Obtains the next token for SRC, running macro expansion on the token
   stream via a macro_expander before the token becomes visible to callers.
   NOTE(review): many lines are missing from this extraction; in particular
   the apparent recursive call below is suspicious — the full file likely
   calls a differently-named helper (e.g. lex_source_get__). Verify. */
1637 lex_source_get (const struct lex_source *src_)
/* The parameter is conceptually const but the implementation mutates SRC. */
1639 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1641 if (!lex_source_get (src))
1644 struct macro_expander *me = macro_expander_create (src->lexer,
1645 lex_source_front (src));
1651 if (!lex_source_get (src))
1653 /* This should not be reachable because we always get a T_STOP at the
1654 end of input and the macro_expander should always terminate
1655 expansion on T_STOP. */
/* Feed tokens into the expander until it reports completion. */
1659 int retval = macro_expander_add (me, lex_source_front (src));
/* Appends a synthetic T_ENDCMD token, with zero length and no source
   location, to SRC's deque.  Used as the sentinel first token of a source. */
1668 lex_source_push_endcmd__ (struct lex_source *src)
1670 struct lex_token *token = lex_push_token__ (src);
1671 token->token.type = T_ENDCMD;
1672 token->token_pos = 0;
1673 token->token_len = 0;
1674 token->line_pos = 0;
1675 token->first_line = 0;
/* Creates and returns a new lex_source that wraps READER, owned by LEXER.
   The source starts with a sentinel T_ENDCMD token in its deque. */
1678 static struct lex_source *
1679 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1681 struct lex_source *src;
/* xzalloc() zero-initializes buffer pointers and all positions. */
1683 src = xzalloc (sizeof *src);
1684 src->reader = reader;
1685 segmenter_init (&src->segmenter, reader->syntax);
/* Start with room for 4 lookahead tokens; the deque grows as needed. */
1687 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1689 lex_source_push_endcmd__ (src);
/* Destroys SRC: runs its reader's destructor, drops all buffered tokens,
   and unlinks SRC from its lexer's source list.
   NOTE(review): the lines that free file_name, encoding, src->buffer, and
   SRC itself are missing from this extraction; file_name/encoding are saved
   before the reader destructor runs because that call may invalidate
   READER's own fields. */
1695 lex_source_destroy (struct lex_source *src)
1697 char *file_name = src->reader->file_name;
1698 char *encoding = src->reader->encoding;
1699 if (src->reader->class->destroy != NULL)
1700 src->reader->class->destroy (src->reader);
1704 while (!deque_is_empty (&src->deque))
1705 lex_source_pop__ (src);
1707 ll_remove (&src->ll);
/* A lex_reader backed by a file (or stdin), read through a u8_istream that
   recodes the file's encoding to UTF-8. */
1711 struct lex_file_reader
1713 struct lex_reader reader; /* Base class; UP_CAST recovers the container. */
1714 struct u8_istream *istream; /* Underlying recoding input stream. */
/* Defined later in this file, after the class's callback functions. */
1717 static struct lex_reader_class lex_file_reader_class;
1719 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1720 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1721 ENCODING, which should take one of the forms accepted by
1722 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1723 mode of the new reader, respectively.
1725 Returns a null pointer if FILE_NAME cannot be opened. */
1727 lex_reader_for_file (const char *file_name, const char *encoding,
1728 enum segmenter_mode syntax,
1729 enum lex_error_mode error)
1731 struct lex_file_reader *r;
1732 struct u8_istream *istream;
/* "-" selects stdin; anything else is opened read-only. */
1734 istream = (!strcmp(file_name, "-")
1735 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1736 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1737 if (istream == NULL)
/* Report the open failure and (presumably) return NULL per the contract
   above — the return statement is missing from this extraction. */
1739 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1743 r = xmalloc (sizeof *r);
1744 lex_reader_init (&r->reader, &lex_file_reader_class);
1745 r->reader.syntax = syntax;
1746 r->reader.error = error;
/* The reader owns copies of the file name and encoding strings. */
1747 r->reader.file_name = xstrdup (file_name);
1748 r->reader.encoding = xstrdup_if_nonnull (encoding);
1749 r->reader.line_number = 1;
1750 r->istream = istream;
/* Downcasts the generic lex_reader R to its containing lex_file_reader. */
1755 static struct lex_file_reader *
1756 lex_file_reader_cast (struct lex_reader *r)
1758 return UP_CAST (r, struct lex_file_reader, reader);
/* "read" callback for file-backed readers: reads up to N bytes into BUF from
   the underlying u8_istream.  PROMPT_STYLE is ignored for files.
   NOTE(review): the return-type line, braces, and the return statements for
   both the error and success paths are missing from this extraction. */
1762 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1763 enum prompt_style prompt_style UNUSED)
1765 struct lex_file_reader *r = lex_file_reader_cast (r_);
1766 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* A negative result indicates a read error; report it with errno. */
1769 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* "close"/destroy callback for file-backed readers: closes the stream unless
   it is stdin (which u8_istream_free() releases without closing the fd).
   NOTE(review): the free(r) that presumably ends this function is missing
   from this extraction. */
1776 lex_file_close (struct lex_reader *r_)
1778 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Never close stdin itself; other streams are closed and errors reported. */
1780 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1782 if (u8_istream_close (r->istream) != 0)
1783 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* For stdin, release the stream object without closing the descriptor. */
1786 u8_istream_free (r->istream);
1791 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader backed by an in-memory string.
   NOTE(review): the struct's remaining members (presumably a substring "s"
   and a read "offset") are missing from this extraction; they are referenced
   by lex_string_read() below. */
1797 struct lex_string_reader
1799 struct lex_reader reader; /* Base class; UP_CAST recovers the container. */
/* Defined later in this file, after the class's callback functions. */
1804 static struct lex_reader_class lex_string_reader_class;
1806 /* Creates and returns a new lex_reader for the contents of S, which must be
1807 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1808 with ss_dealloc() when it is closed. */
1810 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1812 struct lex_string_reader *r;
1814 r = xmalloc (sizeof *r);
1815 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use automatic syntax-mode detection. */
1816 r->reader.syntax = SEG_MODE_AUTO;
1817 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* NOTE(review): the assignments storing S and resetting the read offset,
   and the return statement, are missing from this extraction. */
1824 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1825 which must be encoded in ENCODING. The caller retains ownership of S. */
1827 lex_reader_for_string (const char *s, const char *encoding)
1829 struct substring ss;
/* Copy S so the nocopy variant can take ownership of the copy. */
1830 ss_alloc_substring (&ss, ss_cstr (s));
1831 return lex_reader_for_substring_nocopy (ss, encoding);
1834 /* Formats FORMAT as a printf()-like format string and creates and returns a
1835 new lex_reader for the formatted result. */
1837 lex_reader_for_format (const char *format, const char *encoding, ...)
1839 struct lex_reader *r;
1842 va_start (args, encoding);
/* xvasprintf() allocates the formatted string; the nocopy reader takes
   ownership and frees it on close. */
1843 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* NOTE(review): the matching va_end() and the return statement are missing
   from this extraction. */
/* Downcasts the generic lex_reader R to its containing lex_string_reader. */
1849 static struct lex_string_reader *
1850 lex_string_reader_cast (struct lex_reader *r)
1852 return UP_CAST (r, struct lex_string_reader, reader);
/* "read" callback for string-backed readers: copies up to N bytes of the
   remaining string into BUF.  PROMPT_STYLE is ignored for strings.
   NOTE(review): the return-type line, the offset advance, and the return of
   CHUNK are missing from this extraction. */
1856 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1857 enum prompt_style prompt_style UNUSED)
1859 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Never copy past the end of the backing string. */
1862 chunk = MIN (n, r->s.length - r->offset);
1863 memcpy (buf, r->s.string + r->offset, chunk);
/* "close"/destroy callback for string-backed readers.
   NOTE(review): the body lines that release the owned string (ss_dealloc,
   per the ownership contract documented at lex_reader_for_substring_nocopy)
   and free R are missing from this extraction. */
1870 lex_string_close (struct lex_reader *r_)
1872 struct lex_string_reader *r = lex_string_reader_cast (r_);
1878 static struct lex_reader_class lex_string_reader_class =