pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
  58 /* A token within a lex_source.

        Pairs the parsed token with the location of the source text that
        produced it.  The line- and column-number helpers in this file use
        these members to index the owning lex_source's buffer.

        The order of the members matters: the static T_STOP token in
        lex_next__() initializes this structure positionally. */
  59 struct lex_token
  60   {
  61     /* The regular token information. */
  62     struct token token;
  63
  64     /* Location of token in terms of the lex_source's buffer.
  65        src->tail <= line_pos <= token_pos <= src->head. */
  66     size_t token_pos;           /* Start of token. */
  67     size_t token_len;           /* Length of source for token in bytes. */
  68     size_t line_pos;            /* Start of line containing token_pos. */
  69     int first_line;             /* Line number at token_pos. */
            /* lex_source_get_last_line_number() returns 0 when first_line
               is 0. */
  70   };
  71
  72 /* A source of tokens, corresponding to a syntax file.
  73
  74    This is conceptually a lex_reader wrapped with everything needed to convert
  75    its UTF-8 bytes into tokens. */
  76 struct lex_source
  77   {
  78     struct ll ll;               /* In lexer's list of sources. */
  79     struct lex_reader *reader;
  80     struct segmenter segmenter;
  81     bool eof;                   /* True if T_STOP was read from 'reader'. */
  82
  83     /* Buffer of UTF-8 bytes. */
            /* Source positions are absolute offsets; the helpers in this file
               find the byte at position POS as buffer[POS - tail]. */
  84     char *buffer;
  85     size_t allocated;           /* Number of bytes allocated. */
  86     size_t tail;                /* &buffer[0] offset into UTF-8 source. */
  87     size_t head;                /* &buffer[head - tail] offset into source. */
  88
  89     /* Positions in source file, tail <= pos <= head for each member here. */
  90     size_t journal_pos;         /* First byte not yet output to journal. */
  91     size_t seg_pos;             /* First byte not yet scanned as token. */
  92     size_t line_pos;            /* First byte of line containing seg_pos. */
  93
  94     int n_newlines;             /* Number of new-lines up to seg_pos. */
  95     bool suppress_next_newline;
            /* NOTE(review): suppress_next_newline is not referenced in the
               visible part of this file; confirm its meaning at its users. */
  96
  97     /* Tokens. */
            /* lex_push_token__() adds at the front of the deque, and
               lex_source_pop__() removes from the back. */
  98     struct deque deque;         /* Indexes into 'tokens'. */
  99     struct lex_token *tokens;   /* Lookahead tokens for parser. */
 100   };
 101
     /* Forward declarations.  lex_source_create() is called by lex_include()
        and lex_append() to wrap a reader; lex_source_destroy() is called by
        lex_destroy() and lex_get(). */
 102 static struct lex_source *lex_source_create (struct lex_reader *);
 103 static void lex_source_destroy (struct lex_source *);
 104
 105 /* Lexer.

        Owns a list of token sources.  lex_include() pushes a new source at
        the head of the list and lex_append() pushes one at the tail. */
 106 struct lexer
 107   {
 108     struct ll_list sources;     /* Contains "struct lex_source"s. */
 109   };
 110
     /* Forward declarations of file-local helpers.  Of these, only
        lex_next__(), lex_source_pop__() and lex_source_next__() are defined in
        the part of the file that directly follows; the others are defined
        elsewhere. */
 111 static struct lex_source *lex_source__ (const struct lexer *);
 112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 113 static void lex_source_push_endcmd__ (struct lex_source *);
 114
 115 static void lex_source_pop__ (struct lex_source *);
 116 static bool lex_source_get__ (const struct lex_source *);
     /* FORMAT is the fourth parameter and the arguments arrive as a va_list,
        hence PRINTF_FORMAT (4, 0). */
 117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 118                                      const char *format, va_list)
 119    PRINTF_FORMAT (4, 0);
 120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 121                                                   int n);
 122 \f
 123 /* Initializes READER with the specified CLASS and otherwise some reasonable
 124    defaults.  The caller should fill in the others members as desired. */
 125 void
 126 lex_reader_init (struct lex_reader *reader,
 127                  const struct lex_reader_class *class)
 128 {
 129   reader->class = class;
 130   reader->syntax = LEX_SYNTAX_AUTO;
 131   reader->error = LEX_ERROR_CONTINUE;
 132   reader->file_name = NULL;
 133   reader->encoding = NULL;
 134   reader->line_number = 0;
 135   reader->eof = false;
 136 }
 137
 138 /* Frees any file name already in READER and replaces it by a copy of
 139    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 140 void
 141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 142 {
 143   free (reader->file_name);
 144   reader->file_name = xstrdup_if_nonnull (file_name);
 145 }
 146 \f
 147 /* Creates and returns a new lexer. */
 148 struct lexer *
 149 lex_create (void)
 150 {
 151   struct lexer *lexer = xzalloc (sizeof *lexer);
 152   ll_init (&lexer->sources);
 153   return lexer;
 154 }
 155
 156 /* Destroys LEXER. */
 157 void
 158 lex_destroy (struct lexer *lexer)
 159 {
 160   if (lexer != NULL)
 161     {
 162       struct lex_source *source, *next;
 163
 164       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 165         lex_source_destroy (source);
 166       free (lexer);
 167     }
 168 }
 169
 170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 171    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 172    token. */
 173 void
 174 lex_include (struct lexer *lexer, struct lex_reader *reader)
 175 {
 176   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 177   ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
 178 }
 179
 180 /* Appends READER to LEXER, so that it will be read after all other current
 181    readers have already been read. */
 182 void
 183 lex_append (struct lexer *lexer, struct lex_reader *reader)
 184 {
 185   ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
 186 }
 187 \f
 188 /* Advancing. */
 189
 190 static struct lex_token *
 191 lex_push_token__ (struct lex_source *src)
 192 {
 193   struct lex_token *token;
 194
 195   if (deque_is_full (&src->deque))
 196     src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
 197
 198   token = &src->tokens[deque_push_front (&src->deque)];
 199   token_init (&token->token);
 200   return token;
 201 }
 202
 203 static void
 204 lex_source_pop__ (struct lex_source *src)
 205 {
 206   token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
 207 }
 208
 209 static void
 210 lex_source_pop_front (struct lex_source *src)
 211 {
 212   token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
 213 }
 214
 215 /* Advances LEXER to the next token, consuming the current token. */
 216 void
 217 lex_get (struct lexer *lexer)
 218 {
 219   struct lex_source *src;
 220
 221   src = lex_source__ (lexer);
 222   if (src == NULL)
 223     return;
 224
 225   if (!deque_is_empty (&src->deque))
 226     lex_source_pop__ (src);
 227
 228   while (deque_is_empty (&src->deque))
 229     if (!lex_source_get__ (src))
 230       {
 231         lex_source_destroy (src);
 232         src = lex_source__ (lexer);
 233         if (src == NULL)
 234           return;
 235       }
 236 }
 237 \f
 238 /* Issuing errors. */
 239
 240 /* Prints a syntax error message containing the current token and
 241    given message MESSAGE (if non-null). */
 242 void
 243 lex_error (struct lexer *lexer, const char *format, ...)
 244 {
 245   va_list args;
 246
 247   va_start (args, format);
 248   lex_next_error_valist (lexer, 0, 0, format, args);
 249   va_end (args);
 250 }
 251
 252 /* Prints a syntax error message containing the current token and
 253    given message MESSAGE (if non-null). */
 254 void
 255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 256 {
 257   lex_next_error_valist (lexer, 0, 0, format, args);
 258 }
 259
 260 /* Prints a syntax error message containing the current token and
 261    given message MESSAGE (if non-null). */
 262 void
 263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 264 {
 265   va_list args;
 266
 267   va_start (args, format);
 268   lex_next_error_valist (lexer, n0, n1, format, args);
 269   va_end (args);
 270 }
 271
 272 /* Prints a syntax error message saying that OPTION0 or one of the other
 273    strings following it, up to the first NULL, is expected. */
 274 void
 275 (lex_error_expecting) (struct lexer *lexer, const char *option0, ...)
 276 {
 277   enum { MAX_OPTIONS = 8 };
 278   const char *options[MAX_OPTIONS + 1];
 279   va_list args;
 280   int n;
 281
 282   va_start (args, option0);
 283   options[0] = option0;
 284   n = 0;
 285   while (n + 1 < MAX_OPTIONS && options[n] != NULL)
 286     options[++n] = va_arg (args, const char *);
 287   va_end (args);
 288
 289   switch (n)
 290     {
 291     case 0:
 292       lex_error (lexer, NULL);
 293       break;
 294
 295     case 1:
 296       lex_error (lexer, _("expecting %s"), options[0]);
 297       break;
 298
 299     case 2:
 300       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 301       break;
 302
 303     case 3:
 304       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 305                  options[2]);
 306       break;
 307
 308     case 4:
 309       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 310                  options[0], options[1], options[2], options[3]);
 311       break;
 312
 313     case 5:
 314       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 315                  options[0], options[1], options[2], options[3], options[4]);
 316       break;
 317
 318     case 6:
 319       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 320                  options[0], options[1], options[2], options[3], options[4],
 321                  options[5]);
 322       break;
 323
 324     case 7:
 325       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 326                  options[0], options[1], options[2], options[3], options[4],
 327                  options[5], options[6]);
 328       break;
 329
 330     case 8:
 331       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 332                  options[0], options[1], options[2], options[3], options[4],
 333                  options[5], options[6], options[7]);
 334       break;
 335
 336     default:
 337       NOT_REACHED ();
 338     }
 339 }
 340
 341 /* Reports an error to the effect that subcommand SBC may only be specified
 342    once.
 343
 344    This function does not take a lexer as an argument or use lex_error(),
 345    because the result would ordinarily just be redundant: "Syntax error at
 346    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 347    not help the user find the error. */
 348 void
 349 lex_sbc_only_once (const char *sbc)
 350 {
 351   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 352 }
 353
 354 /* Reports an error to the effect that subcommand SBC is missing.
 355
 356    This function does not take a lexer as an argument or use lex_error(),
 357    because a missing subcommand can normally be detected only after the whole
 358    command has been parsed, and so lex_error() would always report "Syntax
 359    error at end of command", which does not help the user find the error. */
 360 void
 361 lex_sbc_missing (const char *sbc)
 362 {
 363   msg (SE, _("Required subcommand %s was not specified."), sbc);
 364 }
 365
 366 /* Reports an error to the effect that specification SPEC may only be specified
 367    once within subcommand SBC. */
 368 void
 369 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 370 {
 371   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 372              spec, sbc);
 373 }
 374
 375 /* Reports an error to the effect that specification SPEC is missing within
 376    subcommand SBC. */
 377 void
 378 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 379 {
 380   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 381              sbc, spec);
 382 }
 383
 384 /* Prints a syntax error message containing the current token and
 385    given message MESSAGE (if non-null). */
 386 void
 387 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 388                        const char *format, va_list args)
 389 {
 390   struct lex_source *src = lex_source__ (lexer);
 391
 392   if (src != NULL)
 393     lex_source_error_valist (src, n0, n1, format, args);
 394   else
 395     {
 396       struct string s;
 397
 398       ds_init_empty (&s);
 399       ds_put_format (&s, _("Syntax error at end of input"));
 400       if (format != NULL)
 401         {
 402           ds_put_cstr (&s, ": ");
 403           ds_put_vformat (&s, format, args);
 404         }
 405       ds_put_byte (&s, '.');
 406       msg (SE, "%s", ds_cstr (&s));
 407       ds_destroy (&s);
 408     }
 409 }
 410
 411 /* Checks that we're at end of command.
 412    If so, returns a successful command completion code.
 413    If not, flags a syntax error and returns an error command
 414    completion code. */
 415 int
 416 lex_end_of_command (struct lexer *lexer)
 417 {
 418   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 419     {
 420       lex_error (lexer, _("expecting end of command"));
 421       return CMD_FAILURE;
 422     }
 423   else
 424     return CMD_SUCCESS;
 425 }
 426 \f
 427 /* Token testing functions. */
 428
 429 /* Returns true if the current token is a number. */
 430 bool
 431 lex_is_number (const struct lexer *lexer)
 432 {
 433   return lex_next_is_number (lexer, 0);
 434 }
 435
 436 /* Returns true if the current token is a string. */
 437 bool
 438 lex_is_string (const struct lexer *lexer)
 439 {
 440   return lex_next_is_string (lexer, 0);
 441 }
 442
 443 /* Returns the value of the current token, which must be a
 444    floating point number. */
 445 double
 446 lex_number (const struct lexer *lexer)
 447 {
 448   return lex_next_number (lexer, 0);
 449 }
 450
 451 /* Returns true iff the current token is an integer. */
 452 bool
 453 lex_is_integer (const struct lexer *lexer)
 454 {
 455   return lex_next_is_integer (lexer, 0);
 456 }
 457
 458 /* Returns the value of the current token, which must be an
 459    integer. */
 460 long
 461 lex_integer (const struct lexer *lexer)
 462 {
 463   return lex_next_integer (lexer, 0);
 464 }
 465 \f
 466 /* Token testing functions with lookahead.
 467
 468    A value of 0 for N as an argument to any of these functions refers to the
 469    current token.  Lookahead is limited to the current command.  Any N greater
 470    than the number of tokens remaining in the current command will be treated
 471    as referring to a T_ENDCMD token. */
 472
 473 /* Returns true if the token N ahead of the current token is a number. */
 474 bool
 475 lex_next_is_number (const struct lexer *lexer, int n)
 476 {
 477   enum token_type next_token = lex_next_token (lexer, n);
 478   return next_token == T_POS_NUM || next_token == T_NEG_NUM;
 479 }
 480
 481 /* Returns true if the token N ahead of the current token is a string. */
 482 bool
 483 lex_next_is_string (const struct lexer *lexer, int n)
 484 {
 485   return lex_next_token (lexer, n) == T_STRING;
 486 }
 487
 488 /* Returns the value of the token N ahead of the current token, which must be a
 489    floating point number. */
 490 double
 491 lex_next_number (const struct lexer *lexer, int n)
 492 {
 493   assert (lex_next_is_number (lexer, n));
 494   return lex_next_tokval (lexer, n);
 495 }
 496
 497 /* Returns true if the token N ahead of the current token is an integer. */
 498 bool
 499 lex_next_is_integer (const struct lexer *lexer, int n)
 500 {
 501   double value;
 502
 503   if (!lex_next_is_number (lexer, n))
 504     return false;
 505
 506   value = lex_next_tokval (lexer, n);
 507   return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
 508 }
 509
 510 /* Returns the value of the token N ahead of the current token, which must be
 511    an integer. */
 512 long
 513 lex_next_integer (const struct lexer *lexer, int n)
 514 {
 515   assert (lex_next_is_integer (lexer, n));
 516   return lex_next_tokval (lexer, n);
 517 }
 518 \f
 519 /* Token matching functions. */
 520
 521 /* If the current token has the specified TYPE, skips it and returns true.
 522    Otherwise, returns false. */
 523 bool
 524 lex_match (struct lexer *lexer, enum token_type type)
 525 {
 526   if (lex_token (lexer) == type)
 527     {
 528       lex_get (lexer);
 529       return true;
 530     }
 531   else
 532     return false;
 533 }
 534
 535 /* If the current token matches IDENTIFIER, skips it and returns true.
 536    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 537    returns false.
 538
 539    IDENTIFIER must be an ASCII string. */
 540 bool
 541 lex_match_id (struct lexer *lexer, const char *identifier)
 542 {
 543   return lex_match_id_n (lexer, identifier, 3);
 544 }
 545
 546 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 547    may be abbreviated to its first N letters.  Otherwise, returns false.
 548
 549    IDENTIFIER must be an ASCII string. */
 550 bool
 551 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 552 {
 553   if (lex_token (lexer) == T_ID
 554       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 555     {
 556       lex_get (lexer);
 557       return true;
 558     }
 559   else
 560     return false;
 561 }
 562
 563 /* If the current token is integer X, skips it and returns true.  Otherwise,
 564    returns false. */
 565 bool
 566 lex_match_int (struct lexer *lexer, int x)
 567 {
 568   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 569     {
 570       lex_get (lexer);
 571       return true;
 572     }
 573   else
 574     return false;
 575 }
 576 \f
 577 /* Forced matches. */
 578
 579 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 580    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 581    false.
 582
 583    IDENTIFIER must be an ASCII string. */
 584 bool
 585 lex_force_match_id (struct lexer *lexer, const char *identifier)
 586 {
 587   if (lex_match_id (lexer, identifier))
 588     return true;
 589   else
 590     {
 591       lex_error_expecting (lexer, identifier);
 592       return false;
 593     }
 594 }
 595
 596 /* If the current token has the specified TYPE, skips it and returns true.
 597    Otherwise, reports an error and returns false. */
 598 bool
 599 lex_force_match (struct lexer *lexer, enum token_type type)
 600 {
 601   if (lex_token (lexer) == type)
 602     {
 603       lex_get (lexer);
 604       return true;
 605     }
 606   else
 607     {
 608       const char *type_string = token_type_to_string (type);
 609       if (type_string)
 610         {
 611           char *s = xasprintf ("`%s'", type_string);
 612           lex_error_expecting (lexer, s);
 613           free (s);
 614         }
 615       else
 616         lex_error_expecting (lexer, token_type_to_name (type));
 617
 618       return false;
 619     }
 620 }
 621
 622 /* If the current token is a string, does nothing and returns true.
 623    Otherwise, reports an error and returns false. */
 624 bool
 625 lex_force_string (struct lexer *lexer)
 626 {
 627   if (lex_is_string (lexer))
 628     return true;
 629   else
 630     {
 631       lex_error (lexer, _("expecting string"));
 632       return false;
 633     }
 634 }
 635
 636 /* If the current token is a string or an identifier, does nothing and returns
 637    true.  Otherwise, reports an error and returns false.
 638
 639    This is meant for use in syntactic situations where we want to encourage the
 640    user to supply a quoted string, but for compatibility we also accept
 641    identifiers.  (One example of such a situation is file names.)  Therefore,
 642    the error message issued when the current token is wrong only says that a
 643    string is expected and doesn't mention that an identifier would also be
 644    accepted. */
 645 bool
 646 lex_force_string_or_id (struct lexer *lexer)
 647 {
 648   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 649 }
 650
 651 /* If the current token is an integer, does nothing and returns true.
 652    Otherwise, reports an error and returns false. */
 653 bool
 654 lex_force_int (struct lexer *lexer)
 655 {
 656   if (lex_is_integer (lexer))
 657     return true;
 658   else
 659     {
 660       lex_error (lexer, _("expecting integer"));
 661       return false;
 662     }
 663 }
 664
 665 /* If the current token is a number, does nothing and returns true.
 666    Otherwise, reports an error and returns false. */
 667 bool
 668 lex_force_num (struct lexer *lexer)
 669 {
 670   if (lex_is_number (lexer))
 671     return true;
 672
 673   lex_error (lexer, _("expecting number"));
 674   return false;
 675 }
 676
 677 /* If the current token is an identifier, does nothing and returns true.
 678    Otherwise, reports an error and returns false. */
 679 bool
 680 lex_force_id (struct lexer *lexer)
 681 {
 682   if (lex_token (lexer) == T_ID)
 683     return true;
 684
 685   lex_error (lexer, _("expecting identifier"));
 686   return false;
 687 }
 688 \f
 689 /* Token accessors. */
 690
 691 /* Returns the type of LEXER's current token. */
 692 enum token_type
 693 lex_token (const struct lexer *lexer)
 694 {
 695   return lex_next_token (lexer, 0);
 696 }
 697
 698 /* Returns the number in LEXER's current token.
 699
 700    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 701    tokens this function will always return zero. */
 702 double
 703 lex_tokval (const struct lexer *lexer)
 704 {
 705   return lex_next_tokval (lexer, 0);
 706 }
 707
 708 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 709
 710    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 711    this functions this function will always return NULL.
 712
 713    The UTF-8 encoding of the returned string is correct for variable names and
 714    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 715    data_in() to use it in a "union value".  */
 716 const char *
 717 lex_tokcstr (const struct lexer *lexer)
 718 {
 719   return lex_next_tokcstr (lexer, 0);
 720 }
 721
 722 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 723    null-terminated (but the null terminator is not included in the returned
 724    substring's 'length').
 725
 726    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 727    this functions this function will always return NULL.
 728
 729    The UTF-8 encoding of the returned string is correct for variable names and
 730    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 731    data_in() to use it in a "union value".  */
 732 struct substring
 733 lex_tokss (const struct lexer *lexer)
 734 {
 735   return lex_next_tokss (lexer, 0);
 736 }
 737 \f
 738 /* Looking ahead.
 739
 740    A value of 0 for N as an argument to any of these functions refers to the
 741    current token.  Lookahead is limited to the current command.  Any N greater
 742    than the number of tokens remaining in the current command will be treated
 743    as referring to a T_ENDCMD token. */
 744
 745 static const struct lex_token *
 746 lex_next__ (const struct lexer *lexer_, int n)
 747 {
 748   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 749   struct lex_source *src = lex_source__ (lexer);
 750
 751   if (src != NULL)
 752     return lex_source_next__ (src, n);
 753   else
 754     {
 755       static const struct lex_token stop_token =
 756         { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
 757
 758       return &stop_token;
 759     }
 760 }
 761
 762 static const struct lex_token *
 763 lex_source_next__ (const struct lex_source *src, int n)
 764 {
 765   while (deque_count (&src->deque) <= n)
 766     {
 767       if (!deque_is_empty (&src->deque))
 768         {
 769           struct lex_token *front;
 770
 771           front = &src->tokens[deque_front (&src->deque, 0)];
 772           if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
 773             return front;
 774         }
 775
 776       lex_source_get__ (src);
 777     }
 778
 779   return &src->tokens[deque_back (&src->deque, n)];
 780 }
 781
 782 /* Returns the "struct token" of the token N after the current one in LEXER.
 783    The returned pointer can be invalidated by pretty much any succeeding call
 784    into the lexer, although the string pointer within the returned token is
 785    only invalidated by consuming the token (e.g. with lex_get()). */
 786 const struct token *
 787 lex_next (const struct lexer *lexer, int n)
 788 {
 789   return &lex_next__ (lexer, n)->token;
 790 }
 791
 792 /* Returns the type of the token N after the current one in LEXER. */
 793 enum token_type
 794 lex_next_token (const struct lexer *lexer, int n)
 795 {
 796   return lex_next (lexer, n)->type;
 797 }
 798
 799 /* Returns the number in the tokn N after the current one in LEXER.
 800
 801    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 802    tokens this function will always return zero. */
 803 double
 804 lex_next_tokval (const struct lexer *lexer, int n)
 805 {
 806   const struct token *token = lex_next (lexer, n);
 807   return token->number;
 808 }
 809
 810 /* Returns the null-terminated string in the token N after the current one, in
 811    UTF-8 encoding.
 812
 813    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 814    this functions this function will always return NULL.
 815
 816    The UTF-8 encoding of the returned string is correct for variable names and
 817    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 818    data_in() to use it in a "union value".  */
 819 const char *
 820 lex_next_tokcstr (const struct lexer *lexer, int n)
 821 {
 822   return lex_next_tokss (lexer, n).string;
 823 }
 824
 825 /* Returns the string in the token N after the current one, in UTF-8 encoding.
 826    The string is null-terminated (but the null terminator is not included in
 827    the returned substring's 'length').
 828
 829    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 830    this functions this function will always return NULL.
 831
 832    The UTF-8 encoding of the returned string is correct for variable names and
 833    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 834    data_in() to use it in a "union value".  */
 835 struct substring
 836 lex_next_tokss (const struct lexer *lexer, int n)
 837 {
 838   return lex_next (lexer, n)->string;
 839 }
 840
 841 static bool
 842 lex_tokens_match (const struct token *actual, const struct token *expected)
 843 {
 844   if (actual->type != expected->type)
 845     return false;
 846
 847   switch (actual->type)
 848     {
 849     case T_POS_NUM:
 850     case T_NEG_NUM:
 851       return actual->number == expected->number;
 852
 853     case T_ID:
 854       return lex_id_match (expected->string, actual->string);
 855
 856     case T_STRING:
 857       return (actual->string.length == expected->string.length
 858               && !memcmp (actual->string.string, expected->string.string,
 859                           actual->string.length));
 860
 861     default:
 862       return true;
 863     }
 864 }
 865
 866 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
 867    skips it and returns true.  Otherwise, returns false.
 868
 869    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
 870    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
 871    first three letters. */
 872 bool
 873 lex_match_phrase (struct lexer *lexer, const char *s)
 874 {
 875   struct string_lexer slex;
 876   struct token token;
 877   int i;
 878
 879   i = 0;
 880   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
 881   while (string_lexer_next (&slex, &token))
 882     if (token.type != SCAN_SKIP)
 883       {
 884         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
 885         token_destroy (&token);
 886         if (!match)
 887           return false;
 888       }
 889
 890   while (i-- > 0)
 891     lex_get (lexer);
 892   return true;
 893 }
 894
 895 static int
 896 lex_source_get_first_line_number (const struct lex_source *src, int n)
 897 {
 898   return lex_source_next__ (src, n)->first_line;
 899 }
 900
 901 static int
 902 count_newlines (char *s, size_t length)
 903 {
 904   int n_newlines = 0;
 905   char *newline;
 906
 907   while ((newline = memchr (s, '\n', length)) != NULL)
 908     {
 909       n_newlines++;
 910       length -= (newline + 1) - s;
 911       s = newline + 1;
 912     }
 913
 914   return n_newlines;
 915 }
 916
 917 static int
 918 lex_source_get_last_line_number (const struct lex_source *src, int n)
 919 {
 920   const struct lex_token *token = lex_source_next__ (src, n);
 921
 922   if (token->first_line == 0)
 923     return 0;
 924   else
 925     {
 926       char *token_str = &src->buffer[token->token_pos - src->tail];
 927       return token->first_line + count_newlines (token_str, token->token_len) + 1;
 928     }
 929 }
 930
 931 static int
 932 count_columns (const char *s_, size_t length)
 933 {
 934   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
 935   int columns;
 936   size_t ofs;
 937   int mblen;
 938
 939   columns = 0;
 940   for (ofs = 0; ofs < length; ofs += mblen)
 941     {
 942       ucs4_t uc;
 943
 944       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
 945       if (uc != '\t')
 946         {
 947           int width = uc_width (uc, "UTF-8");
 948           if (width > 0)
 949             columns += width;
 950         }
 951       else
 952         columns = ROUND_UP (columns + 1, 8);
 953     }
 954
 955   return columns + 1;
 956 }
 957
 958 static int
 959 lex_source_get_first_column (const struct lex_source *src, int n)
 960 {
 961   const struct lex_token *token = lex_source_next__ (src, n);
 962   return count_columns (&src->buffer[token->line_pos - src->tail],
 963                         token->token_pos - token->line_pos);
 964 }
 965
 966 static int
 967 lex_source_get_last_column (const struct lex_source *src, int n)
 968 {
 969   const struct lex_token *token = lex_source_next__ (src, n);
 970   char *start, *end, *newline;
 971
 972   start = &src->buffer[token->line_pos - src->tail];
 973   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
 974   newline = memrchr (start, '\n', end - start);
 975   if (newline != NULL)
 976     start = newline + 1;
 977   return count_columns (start, end - start);
 978 }
 979
 980 /* Returns the 1-based line number of the start of the syntax that represents
 981    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
 982    if the token is drawn from a source that does not have line numbers. */
 983 int
 984 lex_get_first_line_number (const struct lexer *lexer, int n)
 985 {
 986   const struct lex_source *src = lex_source__ (lexer);
 987   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
 988 }
 989
 990 /* Returns the 1-based line number of the end of the syntax that represents the
 991    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
 992    token or if the token is drawn from a source that does not have line
 993    numbers.
 994
 995    Most of the time, a single token is wholly within a single line of syntax,
 996    but there are two exceptions: a T_STRING token can be made up of multiple
 997    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
 998    token can consist of a "-" on one line followed by the number on the next.
 999  */
1000 int
1001 lex_get_last_line_number (const struct lexer *lexer, int n)
1002 {
1003   const struct lex_source *src = lex_source__ (lexer);
1004   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1005 }
1006
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER starts, or 0 for a T_STOP token.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters are 2 columns wide and combining
   characters are 0 columns wide. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
1020
/* Returns 1 plus the 1-based column at which the syntax for the token N after
   the current one in LEXER ends, or 0 for a T_STOP token.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters are 2 columns wide and combining
   characters are 0 columns wide. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1034
1035 /* Returns the name of the syntax file from which the current command is drawn.
1036    Returns NULL for a T_STOP token or if the command's source does not have
1037    line numbers.
1038
1039    There is no version of this function that takes an N argument because
1040    lookahead only works to the end of a command and any given command is always
1041    within a single syntax file. */
1042 const char *
1043 lex_get_file_name (const struct lexer *lexer)
1044 {
1045   struct lex_source *src = lex_source__ (lexer);
1046   return src == NULL ? NULL : src->reader->file_name;
1047 }
1048
1049 const char *
1050 lex_get_encoding (const struct lexer *lexer)
1051 {
1052   struct lex_source *src = lex_source__ (lexer);
1053   return src == NULL ? NULL : src->reader->encoding;
1054 }
1055
1056
1057 /* Returns the syntax mode for the syntax file from which the current drawn is
1058    drawn.  Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1059    source does not have line numbers.
1060
1061    There is no version of this function that takes an N argument because
1062    lookahead only works to the end of a command and any given command is always
1063    within a single syntax file. */
1064 enum lex_syntax_mode
1065 lex_get_syntax_mode (const struct lexer *lexer)
1066 {
1067   struct lex_source *src = lex_source__ (lexer);
1068   return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1069 }
1070
1071 /* Returns the error mode for the syntax file from which the current drawn is
1072    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1073    source does not have line numbers.
1074
1075    There is no version of this function that takes an N argument because
1076    lookahead only works to the end of a command and any given command is always
1077    within a single syntax file. */
1078 enum lex_error_mode
1079 lex_get_error_mode (const struct lexer *lexer)
1080 {
1081   struct lex_source *src = lex_source__ (lexer);
1082   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1083 }
1084
1085 /* If the source that LEXER is currently reading has error mode
1086    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1087    token to be read comes directly from whatever is next read from the stream.
1088
1089    It makes sense to call this function after encountering an error in a
1090    command entered on the console, because usually the user would prefer not to
1091    have cascading errors. */
1092 void
1093 lex_interactive_reset (struct lexer *lexer)
1094 {
1095   struct lex_source *src = lex_source__ (lexer);
1096   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1097     {
1098       src->head = src->tail = 0;
1099       src->journal_pos = src->seg_pos = src->line_pos = 0;
1100       src->n_newlines = 0;
1101       src->suppress_next_newline = false;
1102       segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1103       while (!deque_is_empty (&src->deque))
1104         lex_source_pop__ (src);
1105       lex_source_push_endcmd__ (src);
1106     }
1107 }
1108
1109 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1110 void
1111 lex_discard_rest_of_command (struct lexer *lexer)
1112 {
1113   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1114     lex_get (lexer);
1115 }
1116
1117 /* Discards all lookahead tokens in LEXER, then discards all input sources
1118    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1119    runs out of input sources. */
1120 void
1121 lex_discard_noninteractive (struct lexer *lexer)
1122 {
1123   struct lex_source *src = lex_source__ (lexer);
1124
1125   if (src != NULL)
1126     {
1127       while (!deque_is_empty (&src->deque))
1128         lex_source_pop__ (src);
1129
1130       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1131            src = lex_source__ (lexer))
1132         lex_source_destroy (src);
1133     }
1134 }
1135 \f
1136 static size_t
1137 lex_source_max_tail__ (const struct lex_source *src)
1138 {
1139   const struct lex_token *token;
1140   size_t max_tail;
1141
1142   assert (src->seg_pos >= src->line_pos);
1143   max_tail = MIN (src->journal_pos, src->line_pos);
1144
1145   /* Use the oldest token also.  (We know that src->deque cannot be empty
1146      because we are in the process of adding a new token, which is already
1147      initialized enough to use here.) */
1148   token = &src->tokens[deque_back (&src->deque, 0)];
1149   assert (token->token_pos >= token->line_pos);
1150   max_tail = MIN (max_tail, token->line_pos);
1151
1152   return max_tail;
1153 }
1154
1155 static void
1156 lex_source_expand__ (struct lex_source *src)
1157 {
1158   if (src->head - src->tail >= src->allocated)
1159     {
1160       size_t max_tail = lex_source_max_tail__ (src);
1161       if (max_tail > src->tail)
1162         {
1163           /* Advance the tail, freeing up room at the head. */
1164           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1165                    src->head - max_tail);
1166           src->tail = max_tail;
1167         }
1168       else
1169         {
1170           /* Buffer is completely full.  Expand it. */
1171           src->buffer = x2realloc (src->buffer, &src->allocated);
1172         }
1173     }
1174   else
1175     {
1176       /* There's space available at the head of the buffer.  Nothing to do. */
1177     }
1178 }
1179
1180 static void
1181 lex_source_read__ (struct lex_source *src)
1182 {
1183   do
1184     {
1185       lex_source_expand__ (src);
1186
1187       size_t head_ofs = src->head - src->tail;
1188       size_t space = src->allocated - head_ofs;
1189       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1190       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1191                                            space, prompt);
1192       assert (n <= space);
1193
1194       if (n == 0)
1195         {
1196           /* End of input. */
1197           src->reader->eof = true;
1198           lex_source_expand__ (src);
1199           return;
1200         }
1201
1202       src->head += n;
1203     }
1204   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1205                   src->head - src->seg_pos));
1206 }
1207
1208 static struct lex_source *
1209 lex_source__ (const struct lexer *lexer)
1210 {
1211   return (ll_is_empty (&lexer->sources) ? NULL
1212           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1213 }
1214
1215 static struct substring
1216 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1217 {
1218   const struct lex_token *token0 = lex_source_next__ (src, n0);
1219   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1220   size_t start = token0->token_pos;
1221   size_t end = token1->token_pos + token1->token_len;
1222
1223   return ss_buffer (&src->buffer[start - src->tail], end - start);
1224 }
1225
1226 static void
1227 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1228 {
1229   size_t out_maxlen;
1230   size_t out_len;
1231   int mblen;
1232
1233   assert (out_size >= 16);
1234   out_maxlen = out_size - 1;
1235   if (in.length > out_maxlen - 3)
1236     out_maxlen -= 3;
1237
1238   for (out_len = 0; out_len < in.length; out_len += mblen)
1239     {
1240       if (in.string[out_len] == '\n'
1241           || in.string[out_len] == '\0'
1242           || (in.string[out_len] == '\r'
1243               && out_len + 1 < in.length
1244               && in.string[out_len + 1] == '\n'))
1245         break;
1246
1247       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1248                         in.length - out_len);
1249
1250       if (mblen < 0)
1251         break;
1252
1253       if (out_len + mblen > out_maxlen)
1254         break;
1255     }
1256
1257   memcpy (out, in.string, out_len);
1258   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1259 }
1260
1261 static void
1262 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1263                          const char *format, va_list args)
1264 {
1265   const struct lex_token *token;
1266   struct string s;
1267
1268   ds_init_empty (&s);
1269
1270   token = lex_source_next__ (src, n0);
1271   if (token->token.type == T_ENDCMD)
1272     ds_put_cstr (&s, _("Syntax error at end of command"));
1273   else
1274     {
1275       struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1276       if (!ss_is_empty (syntax))
1277         {
1278           char syntax_cstr[64];
1279
1280           lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1281           ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1282         }
1283       else
1284         ds_put_cstr (&s, _("Syntax error"));
1285     }
1286
1287   if (format)
1288     {
1289       ds_put_cstr (&s, ": ");
1290       ds_put_vformat (&s, format, args);
1291     }
1292   ds_put_byte (&s, '.');
1293
1294   struct msg m = {
1295     .category = MSG_C_SYNTAX,
1296     .severity = MSG_S_ERROR,
1297     .file_name = src->reader->file_name,
1298     .first_line = lex_source_get_first_line_number (src, n0),
1299     .last_line = lex_source_get_last_line_number (src, n1),
1300     .first_column = lex_source_get_first_column (src, n0),
1301     .last_column = lex_source_get_last_column (src, n1),
1302     .text = ds_steal_cstr (&s),
1303   };
1304   msg_emit (&m);
1305 }
1306
1307 static void PRINTF_FORMAT (2, 3)
1308 lex_get_error (struct lex_source *src, const char *format, ...)
1309 {
1310   va_list args;
1311   int n;
1312
1313   va_start (args, format);
1314
1315   n = deque_count (&src->deque) - 1;
1316   lex_source_error_valist (src, n, n, format, args);
1317   lex_source_pop_front (src);
1318
1319   va_end (args);
1320 }
1321
/* Attempts to append an additional token into SRC's deque, reading more from
   the underlying lex_reader if necessary.  Returns true if successful, false
   if the deque already represents (a suffix of) the whole lex_reader's
   contents.

   Note that true is also returned when the scanner produced an error or
   SCAN_SKIP pseudo-token: in those cases the token is reported (for errors)
   and then handed to lex_source_pop_front(), so the caller may need to call
   again to obtain a usable token.

   SRC_ is declared const only for the benefit of callers; it is cast to
   non-const and modified here. */
static bool
lex_source_get__ (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
  if (src->eof)
    return false;

  /* State maintained while scanning tokens.  Usually we only need a single
     state, but scanner_push() can return SCAN_SAVE to indicate that the state
     needs to be saved and possibly restored later with SCAN_BACK. */
  struct state
    {
      struct segmenter segmenter;
      enum segment_type last_segment;
      int newlines;             /* Number of newlines encountered so far. */
      /* Maintained here so we can update lex_source's similar members when we
         finish. */
      size_t line_pos;
      size_t seg_pos;
    };

  /* Initialize state. */
  struct state state =
    {
      .segmenter = src->segmenter,
      .newlines = 0,
      .seg_pos = src->seg_pos,
      .line_pos = src->line_pos,
    };
  struct state saved = state;

  /* Append a new token to SRC and initialize it. */
  struct lex_token *token = lex_push_token__ (src);
  struct scanner scanner;
  scanner_init (&scanner, &token->token);
  token->line_pos = src->line_pos;
  token->token_pos = src->seg_pos;
  /* A reader line number of 0 or less means the source has no line numbers,
     which is recorded in the token as first_line == 0. */
  if (src->reader->line_number > 0)
    token->first_line = src->reader->line_number + src->n_newlines;
  else
    token->first_line = 0;

  /* Extract segments and pass them through the scanner until we obtain a
     token. */
  for (;;)
    {
      /* Extract a segment. */
      const char *segment = &src->buffer[state.seg_pos - src->tail];
      size_t seg_maxlen = src->head - state.seg_pos;
      enum segment_type type;
      int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
                                    src->reader->eof, &type);
      if (seg_len < 0)
        {
          /* The segmenter needs more input to produce a segment. */
          assert (!src->reader->eof);
          lex_source_read__ (src);
          continue;
        }

      /* Update state based on the segment. */
      state.last_segment = type;
      state.seg_pos += seg_len;
      if (type == SEG_NEWLINE)
        {
          state.newlines++;
          state.line_pos = state.seg_pos;
        }

      /* Pass the segment into the scanner and try to get a token out. */
      enum scan_result result = scanner_push (&scanner, type,
                                              ss_buffer (segment, seg_len),
                                              &token->token);
      if (result == SCAN_SAVE)
        saved = state;
      else if (result == SCAN_BACK)
        {
          /* Rewind to the most recently saved state. */
          state = saved;
          break;
        }
      else if (result == SCAN_DONE)
        break;
    }

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.  */
  int n_lines = state.newlines;
  if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      /* Emit the line containing the end of the command now, and remember not
         to emit it a second time when its newline is seen. */
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      /* One of these newlines ends a line that was already emitted. */
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos - src->tail];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->head even though that may be beyond what we've actually
         converted to tokens (which is only through state.line_pos).  That's
         because, if we're emitting the line due to SEG_END_COMMAND, we want to
         take the whole line through the newline, not just through the '.'. */
      size_t max_len = src->head - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax. */
      output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                   xmemdup0 (line, copy_len),
                                                   NULL));

      src->journal_pos += line_len;
    }

  /* The token covers everything consumed since the old segment position. */
  token->token_len = state.seg_pos - src->seg_pos;

  /* Commit the scanning state back into SRC. */
  src->segmenter = state.segmenter;
  src->seg_pos = state.seg_pos;
  src->line_pos = state.line_pos;
  src->n_newlines += state.newlines;

  /* Post-process special token types.  Each lex_get_error() call below
     reports the problem and then calls lex_source_pop_front(). */
  switch (token->token.type)
    {
    default:
      break;

    case T_STOP:
      /* End of input: present it as the end of a command, and remember that
         no more tokens can be obtained from SRC. */
      token->token.type = T_ENDCMD;
      src->eof = true;
      break;

    case SCAN_BAD_HEX_LENGTH:
      lex_get_error (src, _("String of hex digits has %d characters, which "
                            "is not a multiple of 2"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_HEX_DIGIT:
    case SCAN_BAD_UNICODE_DIGIT:
      lex_get_error (src, _("`%c' is not a valid hex digit"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_UNICODE_LENGTH:
      lex_get_error (src, _("Unicode string contains %d bytes, which is "
                            "not in the valid range of 1 to 8 bytes"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_UNICODE_CODE_POINT:
      lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
                     (int) token->token.number);
      break;

    case SCAN_EXPECTED_QUOTE:
      lex_get_error (src, _("Unterminated string constant"));
      break;

    case SCAN_EXPECTED_EXPONENT:
      lex_get_error (src, _("Missing exponent following `%s'"),
                     token->token.string.string);
      break;

    case SCAN_UNEXPECTED_DOT:
      lex_get_error (src, _("Unexpected `.' in middle of command"));
      break;

    case SCAN_UNEXPECTED_CHAR:
      {
        char c_name[16];
        lex_get_error (src, _("Bad character %s in input"),
                       uc_name (token->token.number, c_name));
      }
      break;

    case SCAN_SKIP:
      /* No error to report; just hand the token to lex_source_pop_front(). */
      lex_source_pop_front (src);
      break;
    }

  return true;
}
1521 \f
1522 static void
1523 lex_source_push_endcmd__ (struct lex_source *src)
1524 {
1525   struct lex_token *token = lex_push_token__ (src);
1526   token->token.type = T_ENDCMD;
1527   token->token_pos = 0;
1528   token->token_len = 0;
1529   token->line_pos = 0;
1530   token->first_line = 0;
1531 }
1532
1533 static struct lex_source *
1534 lex_source_create (struct lex_reader *reader)
1535 {
1536   struct lex_source *src;
1537   enum segmenter_mode mode;
1538
1539   src = xzalloc (sizeof *src);
1540   src->reader = reader;
1541
1542   if (reader->syntax == LEX_SYNTAX_AUTO)
1543     mode = SEG_MODE_AUTO;
1544   else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1545     mode = SEG_MODE_INTERACTIVE;
1546   else if (reader->syntax == LEX_SYNTAX_BATCH)
1547     mode = SEG_MODE_BATCH;
1548   else
1549     NOT_REACHED ();
1550   segmenter_init (&src->segmenter, mode);
1551
1552   src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1553
1554   lex_source_push_endcmd__ (src);
1555
1556   return src;
1557 }
1558
1559 static void
1560 lex_source_destroy (struct lex_source *src)
1561 {
1562   char *file_name = src->reader->file_name;
1563   char *encoding = src->reader->encoding;
1564   if (src->reader->class->destroy != NULL)
1565     src->reader->class->destroy (src->reader);
1566   free (file_name);
1567   free (encoding);
1568   free (src->buffer);
1569   while (!deque_is_empty (&src->deque))
1570     lex_source_pop__ (src);
1571   free (src->tokens);
1572   ll_remove (&src->ll);
1573   free (src);
1574 }
1575 \f
/* A lex_reader that obtains syntax from a u8_istream. */
struct lex_file_reader
  {
    /* Embedded base reader.  lex_file_reader_cast() relies on this member's
       name to convert a lex_reader pointer back to a lex_file_reader. */
    struct lex_reader reader;
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
  };

/* Defined further down, after the functions that it refers to. */
static struct lex_reader_class lex_file_reader_class;
1583
1584 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1585    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
1586    ENCODING, which should take one of the forms accepted by
1587    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
1588    mode of the new reader, respectively.
1589
1590    Returns a null pointer if FILE_NAME cannot be opened. */
1591 struct lex_reader *
1592 lex_reader_for_file (const char *file_name, const char *encoding,
1593                      enum lex_syntax_mode syntax,
1594                      enum lex_error_mode error)
1595 {
1596   struct lex_file_reader *r;
1597   struct u8_istream *istream;
1598
1599   istream = (!strcmp(file_name, "-")
1600              ? u8_istream_for_fd (encoding, STDIN_FILENO)
1601              : u8_istream_for_file (encoding, file_name, O_RDONLY));
1602   if (istream == NULL)
1603     {
1604       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1605       return NULL;
1606     }
1607
1608   r = xmalloc (sizeof *r);
1609   lex_reader_init (&r->reader, &lex_file_reader_class);
1610   r->reader.syntax = syntax;
1611   r->reader.error = error;
1612   r->reader.file_name = xstrdup (file_name);
1613   r->reader.encoding = xstrdup_if_nonnull (encoding);
1614   r->reader.line_number = 1;
1615   r->istream = istream;
1616
1617   return &r->reader;
1618 }
1619
1620 static struct lex_file_reader *
1621 lex_file_reader_cast (struct lex_reader *r)
1622 {
1623   return UP_CAST (r, struct lex_file_reader, reader);
1624 }
1625
1626 static size_t
1627 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1628                enum prompt_style prompt_style UNUSED)
1629 {
1630   struct lex_file_reader *r = lex_file_reader_cast (r_);
1631   ssize_t n_read = u8_istream_read (r->istream, buf, n);
1632   if (n_read < 0)
1633     {
1634       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1635       return 0;
1636     }
1637   return n_read;
1638 }
1639
1640 static void
1641 lex_file_close (struct lex_reader *r_)
1642 {
1643   struct lex_file_reader *r = lex_file_reader_cast (r_);
1644
1645   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1646     {
1647       if (u8_istream_close (r->istream) != 0)
1648         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1649     }
1650   else
1651     u8_istream_free (r->istream);
1652
1653   free (r);
1654 }
1655
1656 static struct lex_reader_class lex_file_reader_class =
1657   {
1658     lex_file_read,
1659     lex_file_close
1660   };
1661 \f
/* A lex_reader that obtains syntax from an in-memory string. */
struct lex_string_reader
  {
    /* Embedded base reader.  lex_string_reader_cast() relies on this member's
       name to convert a lex_reader pointer back to a lex_string_reader. */
    struct lex_reader reader;
    struct substring s;         /* The string; freed by lex_string_close(). */
    size_t offset;              /* Bytes of S already handed out by
                                   lex_string_read(). */
  };

/* Defined further down, after the functions that it refers to. */
static struct lex_reader_class lex_string_reader_class;
1670
1671 /* Creates and returns a new lex_reader for the contents of S, which must be
1672    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
1673    with ss_dealloc() when it is closed. */
1674 struct lex_reader *
1675 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1676 {
1677   struct lex_string_reader *r;
1678
1679   r = xmalloc (sizeof *r);
1680   lex_reader_init (&r->reader, &lex_string_reader_class);
1681   r->reader.syntax = LEX_SYNTAX_AUTO;
1682   r->reader.encoding = xstrdup_if_nonnull (encoding);
1683   r->s = s;
1684   r->offset = 0;
1685
1686   return &r->reader;
1687 }
1688
1689 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1690    which must be encoded in ENCODING.  The caller retains ownership of S. */
1691 struct lex_reader *
1692 lex_reader_for_string (const char *s, const char *encoding)
1693 {
1694   struct substring ss;
1695   ss_alloc_substring (&ss, ss_cstr (s));
1696   return lex_reader_for_substring_nocopy (ss, encoding);
1697 }
1698
/* Formats the printf()-like format string FORMAT with the arguments that
   follow ENCODING, then creates and returns a new lex_reader for the
   formatted text, which is treated as encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of the formatted text. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1713
1714 static struct lex_string_reader *
1715 lex_string_reader_cast (struct lex_reader *r)
1716 {
1717   return UP_CAST (r, struct lex_string_reader, reader);
1718 }
1719
1720 static size_t
1721 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1722                  enum prompt_style prompt_style UNUSED)
1723 {
1724   struct lex_string_reader *r = lex_string_reader_cast (r_);
1725   size_t chunk;
1726
1727   chunk = MIN (n, r->s.length - r->offset);
1728   memcpy (buf, r->s.string + r->offset, chunk);
1729   r->offset += chunk;
1730
1731   return chunk;
1732 }
1733
1734 static void
1735 lex_string_close (struct lex_reader *r_)
1736 {
1737   struct lex_string_reader *r = lex_string_reader_cast (r_);
1738
1739   ss_dealloc (&r->s);
1740   free (r);
1741 }
1742
1743 static struct lex_reader_class lex_string_reader_class =
1744   {
1745     lex_string_read,
1746     lex_string_close
1747   };