src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18 #include "lexer.h"
  19 #include <libpspp/message.h>
  20 #include <ctype.h>
  21 #include <errno.h>
  22 #include <limits.h>
  23 #include <math.h>
  24 #include <stdarg.h>
  25 #include <stdint.h>
  26 #include <stdlib.h>
  27 #include <libpspp/assertion.h>
  28 #include <language/command.h>
  29 #include <libpspp/message.h>
  30 #include <data/settings.h>
  31 #include <libpspp/getl.h>
  32 #include <libpspp/str.h>
  33 #include <output/journal.h>
  34
  35 #include "xalloc.h"
  36
  37 #include "gettext.h"
  38 #define _(msgid) gettext (msgid)
  39 #define N_(msgid) msgid
  40
  41
  42 #define DUMP_TOKENS 0
  43
  44
  45
  46 struct lexer
  47 {
  48   struct string line_buffer;
  49
  50   struct source_stream *ss;
  51
  52   int token;      /* Current token. */
  53   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  54
  55   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  56
  57   struct string tokstr;   /* T_ID, T_STRING: token string value.
  58                             For T_ID, this is not truncated as is
  59                             tokid. */
  60
  61   char *prog; /* Pointer to next token in line_buffer. */
  62   bool dot;   /* True only if this line ends with a terminal dot. */
  63
  64   int put_token ; /* If nonzero, next token returned by lex_get().
  65                     Used only in exceptional circumstances. */
  66
  67   struct string put_tokstr;
  68   double put_tokval;
  69 };
  70
  71
  72 static int parse_id (struct lexer *);
  73
  74 /* How a string represents its contents. */
  75 enum string_type
  76   {
  77     CHARACTER_STRING,   /* Characters. */
  78     BINARY_STRING,      /* Binary digits. */
  79     OCTAL_STRING,       /* Octal digits. */
  80     HEX_STRING          /* Hexadecimal digits. */
  81   };
  82
  83 static int parse_string (struct lexer *, enum string_type);
  84
  85 #if DUMP_TOKENS
  86 static void dump_token (struct lexer *);
  87 #endif
  88 \f
  89 /* Initialization. */
  90
  91 /* Initializes the lexer. */
  92 struct lexer *
  93 lex_create (struct source_stream *ss)
  94 {
  95   struct lexer *lexer = xzalloc (sizeof (*lexer));
  96
  97   ds_init_empty (&lexer->tokstr);
  98   ds_init_empty (&lexer->put_tokstr);
  99   ds_init_empty (&lexer->line_buffer);
 100   lexer->ss = ss;
 101
 102   return lexer;
 103 }
 104
 105 struct source_stream *
 106 lex_get_source_stream (const struct lexer *lex)
 107 {
 108   return lex->ss;
 109 }
 110
 111 enum syntax_mode
 112 lex_current_syntax_mode (const struct lexer *lex)
 113 {
 114   return source_stream_current_syntax_mode (lex->ss);
 115 }
 116
 117 enum error_mode
 118 lex_current_error_mode (const struct lexer *lex)
 119 {
 120   return source_stream_current_error_mode (lex->ss);
 121 }
 122
 123
 124 void
 125 lex_destroy (struct lexer *lexer)
 126 {
 127   if ( NULL != lexer )
 128     {
 129       ds_destroy (&lexer->put_tokstr);
 130       ds_destroy (&lexer->tokstr);
 131       ds_destroy (&lexer->line_buffer);
 132
 133       free (lexer);
 134     }
 135 }
 136
 137 \f
 138 /* Common functions. */
 139
 140 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 141    tokval, respectively, and sets tokid appropriately. */
 142 static void
 143 restore_token (struct lexer *lexer)
 144 {
 145   assert (lexer->put_token != 0);
 146   lexer->token = lexer->put_token;
 147   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 148   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 149   lexer->tokval = lexer->put_tokval;
 150   lexer->put_token = 0;
 151 }
 152
 153 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 154    put_lexer->tokval respectively. */
 155 static void
 156 save_token (struct lexer *lexer)
 157 {
 158   lexer->put_token = lexer->token;
 159   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 160   lexer->put_tokval = lexer->tokval;
 161 }
 162
 163 /* Parses a single token, setting appropriate global variables to
 164    indicate the token's attributes. */
 165 void
 166 lex_get (struct lexer *lexer)
 167 {
 168   /* Find a token. */
 169   for (;;)
 170     {
 171       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 172         {
 173           lexer->token = T_STOP;
 174           return;
 175         }
 176
 177   /* If a token was pushed ahead, return it. */
 178   if (lexer->put_token)
 179     {
 180       restore_token (lexer);
 181 #if DUMP_TOKENS
 182           dump_token (lexer);
 183 #endif
 184       return;
 185     }
 186
 187   for (;;)
 188     {
 189       /* Skip whitespace. */
 190           while (isspace ((unsigned char) *lexer->prog))
 191             lexer->prog++;
 192
 193           if (*lexer->prog)
 194             break;
 195
 196           if (lexer->dot)
 197             {
 198               lexer->dot = 0;
 199               lexer->token = '.';
 200 #if DUMP_TOKENS
 201               dump_token (lexer);
 202 #endif
 203               return;
 204             }
 205           else if (!lex_get_line (lexer))
 206             {
 207               lexer->prog = NULL;
 208               lexer->token = T_STOP;
 209 #if DUMP_TOKENS
 210               dump_token (lexer);
 211 #endif
 212               return;
 213             }
 214
 215           if (lexer->put_token)
 216             {
 217               restore_token (lexer);
 218 #if DUMP_TOKENS
 219               dump_token (lexer);
 220 #endif
 221               return;
 222             }
 223         }
 224
 225
 226       /* Actually parse the token. */
 227       ds_clear (&lexer->tokstr);
 228
 229       switch (*lexer->prog)
 230         {
 231         case '-': case '.':
 232         case '0': case '1': case '2': case '3': case '4':
 233         case '5': case '6': case '7': case '8': case '9':
 234           {
 235             char *tail;
 236
 237             /* `-' can introduce a negative number, or it can be a
 238                token by itself.  If it is not followed by a digit or a
 239                decimal point, it is definitely not a number.
 240                Otherwise, it might be either, but most of the time we
 241                want it as a number.  When the syntax calls for a `-'
 242                token, lex_negative_to_dash() must be used to break
 243                negative numbers into two tokens. */
 244             if (*lexer->prog == '-')
 245               {
 246                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 247                 while (isspace ((unsigned char) *lexer->prog))
 248                   lexer->prog++;
 249
 250                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 251                   {
 252                     lexer->token = '-';
 253                     break;
 254                   }
 255                 lexer->token = T_NEG_NUM;
 256               }
 257             else
 258               lexer->token = T_POS_NUM;
 259
 260             /* Parse the number, copying it into tokstr. */
 261             while (isdigit ((unsigned char) *lexer->prog))
 262               ds_put_char (&lexer->tokstr, *lexer->prog++);
 263             if (*lexer->prog == '.')
 264               {
 265                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 266                 while (isdigit ((unsigned char) *lexer->prog))
 267                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 268               }
 269             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 270               {
 271                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 272                 if (*lexer->prog == '+' || *lexer->prog == '-')
 273                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 274                 while (isdigit ((unsigned char) *lexer->prog))
 275                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 276               }
 277
 278             /* Parse as floating point. */
 279             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 280             if (*tail)
 281               {
 282                 msg (SE, _("%s does not form a valid number."),
 283                      ds_cstr (&lexer->tokstr));
 284                 lexer->tokval = 0.0;
 285
 286                 ds_clear (&lexer->tokstr);
 287                 ds_put_char (&lexer->tokstr, '0');
 288               }
 289
 290             break;
 291           }
 292
 293         case '\'': case '"':
 294           lexer->token = parse_string (lexer, CHARACTER_STRING);
 295           break;
 296
 297         case '(': case ')': case ',': case '=': case '+': case '/':
 298           lexer->token = *lexer->prog++;
 299           break;
 300
 301         case '*':
 302           if (*++lexer->prog == '*')
 303             {
 304               lexer->prog++;
 305               lexer->token = T_EXP;
 306             }
 307           else
 308             lexer->token = '*';
 309           break;
 310
 311         case '<':
 312           if (*++lexer->prog == '=')
 313             {
 314               lexer->prog++;
 315               lexer->token = T_LE;
 316             }
 317           else if (*lexer->prog == '>')
 318             {
 319               lexer->prog++;
 320               lexer->token = T_NE;
 321             }
 322           else
 323             lexer->token = T_LT;
 324           break;
 325
 326         case '>':
 327           if (*++lexer->prog == '=')
 328             {
 329               lexer->prog++;
 330               lexer->token = T_GE;
 331             }
 332           else
 333             lexer->token = T_GT;
 334           break;
 335
 336         case '~':
 337           if (*++lexer->prog == '=')
 338             {
 339               lexer->prog++;
 340               lexer->token = T_NE;
 341             }
 342           else
 343             lexer->token = T_NOT;
 344           break;
 345
 346         case '&':
 347           lexer->prog++;
 348           lexer->token = T_AND;
 349           break;
 350
 351         case '|':
 352           lexer->prog++;
 353           lexer->token = T_OR;
 354           break;
 355
 356         case 'b': case 'B':
 357           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 358             lexer->token = parse_string (lexer, BINARY_STRING);
 359           else
 360             lexer->token = parse_id (lexer);
 361           break;
 362
 363         case 'o': case 'O':
 364           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 365             lexer->token = parse_string (lexer, OCTAL_STRING);
 366           else
 367             lexer->token = parse_id (lexer);
 368           break;
 369
 370         case 'x': case 'X':
 371           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 372             lexer->token = parse_string (lexer, HEX_STRING);
 373           else
 374             lexer->token = parse_id (lexer);
 375           break;
 376
 377         default:
 378           if (lex_is_id1 (*lexer->prog))
 379             {
 380               lexer->token = parse_id (lexer);
 381               break;
 382             }
 383           else
 384             {
 385               if (isgraph ((unsigned char) *lexer->prog))
 386                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 387               else
 388                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 389               continue;
 390             }
 391         }
 392       break;
 393     }
 394
 395 #if DUMP_TOKENS
 396   dump_token (lexer);
 397 #endif
 398 }
 399
 400 /* Parses an identifier at the current position into tokid and
 401    tokstr.
 402    Returns the correct token type. */
 403 static int
 404 parse_id (struct lexer *lexer)
 405 {
 406   struct substring rest_of_line
 407     = ss_substr (ds_ss (&lexer->line_buffer),
 408                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 409                  SIZE_MAX);
 410   struct substring id = ss_head (rest_of_line,
 411                                  lex_id_get_length (rest_of_line));
 412   lexer->prog += ss_length (id);
 413
 414   ds_assign_substring (&lexer->tokstr, id);
 415   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 416   return lex_id_to_token (id);
 417 }
 418
 419 /* Reports an error to the effect that subcommand SBC may only be
 420    specified once. */
 421 void
 422 lex_sbc_only_once (const char *sbc)
 423 {
 424   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 425 }
 426
 427 /* Reports an error to the effect that subcommand SBC is
 428    missing. */
 429 void
 430 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 431 {
 432   lex_error (lexer, _("missing required subcommand %s"), sbc);
 433 }
 434
 435 /* Prints a syntax error message containing the current token and
 436    given message MESSAGE (if non-null). */
 437 void
 438 lex_error (struct lexer *lexer, const char *message, ...)
 439 {
 440   char *token_rep;
 441   char where[128];
 442
 443   token_rep = lex_token_representation (lexer);
 444   if (lexer->token == T_STOP)
 445     strcpy (where, "end of file");
 446   else if (lexer->token == '.')
 447     strcpy (where, "end of command");
 448   else
 449     snprintf (where, sizeof where, "`%s'", token_rep);
 450   free (token_rep);
 451
 452   if (message)
 453     {
 454       char buf[1024];
 455       va_list args;
 456
 457       va_start (args, message);
 458       vsnprintf (buf, 1024, message, args);
 459       va_end (args);
 460
 461       msg (SE, _("Syntax error %s at %s."), buf, where);
 462     }
 463   else
 464     msg (SE, _("Syntax error at %s."), where);
 465 }
 466
 467 /* Checks that we're at end of command.
 468    If so, returns a successful command completion code.
 469    If not, flags a syntax error and returns an error command
 470    completion code. */
 471 int
 472 lex_end_of_command (struct lexer *lexer)
 473 {
 474   if (lexer->token != '.')
 475     {
 476       lex_error (lexer, _("expecting end of command"));
 477       return CMD_FAILURE;
 478     }
 479   else
 480     return CMD_SUCCESS;
 481 }
 482 \f
 483 /* Token testing functions. */
 484
 485 /* Returns true if the current token is a number. */
 486 bool
 487 lex_is_number (struct lexer *lexer)
 488 {
 489   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 490 }
 491
 492
 493 /* Returns true if the current token is a string. */
 494 bool
 495 lex_is_string (struct lexer *lexer)
 496 {
 497   return lexer->token == T_STRING;
 498 }
 499
 500
 501 /* Returns the value of the current token, which must be a
 502    floating point number. */
 503 double
 504 lex_number (struct lexer *lexer)
 505 {
 506   assert (lex_is_number (lexer));
 507   return lexer->tokval;
 508 }
 509
 510 /* Returns true iff the current token is an integer. */
 511 bool
 512 lex_is_integer (struct lexer *lexer)
 513 {
 514   return (lex_is_number (lexer)
 515           && lexer->tokval > LONG_MIN
 516           && lexer->tokval <= LONG_MAX
 517           && floor (lexer->tokval) == lexer->tokval);
 518 }
 519
 520 /* Returns the value of the current token, which must be an
 521    integer. */
 522 long
 523 lex_integer (struct lexer *lexer)
 524 {
 525   assert (lex_is_integer (lexer));
 526   return lexer->tokval;
 527 }
 528 \f
 529 /* Token matching functions. */
 530
 531 /* If TOK is the current token, skips it and returns true
 532    Otherwise, returns false. */
 533 bool
 534 lex_match (struct lexer *lexer, int t)
 535 {
 536   if (lexer->token == t)
 537     {
 538       lex_get (lexer);
 539       return true;
 540     }
 541   else
 542     return false;
 543 }
 544
 545 /* If the current token is the identifier S, skips it and returns
 546    true.  The identifier may be abbreviated to its first three
 547    letters.
 548    Otherwise, returns false. */
 549 bool
 550 lex_match_id (struct lexer *lexer, const char *s)
 551 {
 552   if (lexer->token == T_ID
 553       && lex_id_match (ss_cstr (s), ss_cstr (lexer->tokid)))
 554     {
 555       lex_get (lexer);
 556       return true;
 557     }
 558   else
 559     return false;
 560 }
 561
 562 /* If the current token is integer N, skips it and returns true.
 563    Otherwise, returns false. */
 564 bool
 565 lex_match_int (struct lexer *lexer, int x)
 566 {
 567   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 568     {
 569       lex_get (lexer);
 570       return true;
 571     }
 572   else
 573     return false;
 574 }
 575 \f
 576 /* Forced matches. */
 577
 578 /* If this token is identifier S, fetches the next token and returns
 579    nonzero.
 580    Otherwise, reports an error and returns zero. */
 581 bool
 582 lex_force_match_id (struct lexer *lexer, const char *s)
 583 {
 584   if (lex_match_id (lexer, s))
 585     return true;
 586   else
 587     {
 588       lex_error (lexer, _("expecting `%s'"), s);
 589       return false;
 590     }
 591 }
 592
 593 /* If the current token is T, skips the token.  Otherwise, reports an
 594    error and returns from the current function with return value false. */
 595 bool
 596 lex_force_match (struct lexer *lexer, int t)
 597 {
 598   if (lexer->token == t)
 599     {
 600       lex_get (lexer);
 601       return true;
 602     }
 603   else
 604     {
 605       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 606       return false;
 607     }
 608 }
 609
 610 /* If this token is a string, does nothing and returns true.
 611    Otherwise, reports an error and returns false. */
 612 bool
 613 lex_force_string (struct lexer *lexer)
 614 {
 615   if (lexer->token == T_STRING)
 616     return true;
 617   else
 618     {
 619       lex_error (lexer, _("expecting string"));
 620       return false;
 621     }
 622 }
 623
 624 /* If this token is an integer, does nothing and returns true.
 625    Otherwise, reports an error and returns false. */
 626 bool
 627 lex_force_int (struct lexer *lexer)
 628 {
 629   if (lex_is_integer (lexer))
 630     return true;
 631   else
 632     {
 633       lex_error (lexer, _("expecting integer"));
 634       return false;
 635     }
 636 }
 637
 638 /* If this token is a number, does nothing and returns true.
 639    Otherwise, reports an error and returns false. */
 640 bool
 641 lex_force_num (struct lexer *lexer)
 642 {
 643   if (lex_is_number (lexer))
 644     return true;
 645
 646   lex_error (lexer, _("expecting number"));
 647   return false;
 648 }
 649
 650 /* If this token is an identifier, does nothing and returns true.
 651    Otherwise, reports an error and returns false. */
 652 bool
 653 lex_force_id (struct lexer *lexer)
 654 {
 655   if (lexer->token == T_ID)
 656     return true;
 657
 658   lex_error (lexer, _("expecting identifier"));
 659   return false;
 660 }
 661
 662 /* Weird token functions. */
 663
 664 /* Returns the first character of the next token, except that if the
 665    next token is not an identifier, the character returned will not be
 666    a character that can begin an identifier.  Specifically, the
 667    hexstring lead-in X' causes lookahead() to return '.  Note that an
 668    alphanumeric return value doesn't guarantee an ID token, it could
 669    also be a reserved-word token. */
 670 int
 671 lex_look_ahead (struct lexer *lexer)
 672 {
 673   if (lexer->put_token)
 674     return lexer->put_token;
 675
 676   for (;;)
 677     {
 678       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 679         return 0;
 680
 681       for (;;)
 682         {
 683           while (isspace ((unsigned char) *lexer->prog))
 684             lexer->prog++;
 685           if (*lexer->prog)
 686             break;
 687
 688           if (lexer->dot)
 689             return '.';
 690           else if (!lex_get_line (lexer))
 691             return 0;
 692
 693           if (lexer->put_token)
 694             return lexer->put_token;
 695         }
 696
 697       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 698            || toupper ((unsigned char) *lexer->prog) == 'B'
 699            || toupper ((unsigned char) *lexer->prog) == 'O')
 700           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 701         return '\'';
 702
 703       return *lexer->prog;
 704     }
 705 }
 706
 707 /* Makes the current token become the next token to be read; the
 708    current token is set to T. */
 709 void
 710 lex_put_back (struct lexer *lexer, int t)
 711 {
 712   save_token (lexer);
 713   lexer->token = t;
 714 }
 715
 716 /* Makes the current token become the next token to be read; the
 717    current token is set to the identifier ID. */
 718 void
 719 lex_put_back_id (struct lexer *lexer, const char *id)
 720 {
 721   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 722   save_token (lexer);
 723   lexer->token = T_ID;
 724   ds_assign_cstr (&lexer->tokstr, id);
 725   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 726 }
 727 \f
 728 /* Weird line processing functions. */
 729
 730 /* Returns the entire contents of the current line. */
 731 const char *
 732 lex_entire_line (const struct lexer *lexer)
 733 {
 734   return ds_cstr (&lexer->line_buffer);
 735 }
 736
 737 const struct string *
 738 lex_entire_line_ds (const struct lexer *lexer)
 739 {
 740   return &lexer->line_buffer;
 741 }
 742
 743 /* As lex_entire_line(), but only returns the part of the current line
 744    that hasn't already been tokenized. */
 745 const char *
 746 lex_rest_of_line (const struct lexer *lexer)
 747 {
 748   return lexer->prog;
 749 }
 750
 751 /* Returns true if the current line ends in a terminal dot,
 752    false otherwise. */
 753 bool
 754 lex_end_dot (const struct lexer *lexer)
 755 {
 756   return lexer->dot;
 757 }
 758
 759 /* Causes the rest of the current input line to be ignored for
 760    tokenization purposes. */
 761 void
 762 lex_discard_line (struct lexer *lexer)
 763 {
 764   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 765   lexer->prog = ds_end (&lexer->line_buffer);
 766   lexer->dot = false;
 767   lexer->put_token = 0;
 768 }
 769
 770
 771 /* Discards the rest of the current command.
 772    When we're reading commands from a file, we skip tokens until
 773    a terminal dot or EOF.
 774    When we're reading commands interactively from the user,
 775    that's just discarding the current line, because presumably
 776    the user doesn't want to finish typing a command that will be
 777    ignored anyway. */
 778 void
 779 lex_discard_rest_of_command (struct lexer *lexer)
 780 {
 781   if (!getl_is_interactive (lexer->ss))
 782     {
 783       while (lexer->token != T_STOP && lexer->token != '.')
 784         lex_get (lexer);
 785     }
 786   else
 787     lex_discard_line (lexer);
 788 }
 789 \f
 790 /* Weird line reading functions. */
 791
 792 /* Remove C-style comments in STRING, begun by slash-star and
 793    terminated by star-slash or newline. */
 794 static void
 795 strip_comments (struct string *string)
 796 {
 797   char *cp;
 798   int quote;
 799   bool in_comment;
 800
 801   in_comment = false;
 802   quote = EOF;
 803   for (cp = ds_cstr (string); *cp; )
 804     {
 805       /* If we're not in a comment, check for quote marks. */
 806       if (!in_comment)
 807         {
 808           if (*cp == quote)
 809             quote = EOF;
 810           else if (*cp == '\'' || *cp == '"')
 811             quote = *cp;
 812         }
 813
 814       /* If we're not inside a quotation, check for comment. */
 815       if (quote == EOF)
 816         {
 817           if (cp[0] == '/' && cp[1] == '*')
 818             {
 819               in_comment = true;
 820               *cp++ = ' ';
 821               *cp++ = ' ';
 822               continue;
 823             }
 824           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 825             {
 826               in_comment = false;
 827               *cp++ = ' ';
 828               *cp++ = ' ';
 829               continue;
 830             }
 831         }
 832
 833       /* Check commenting. */
 834       if (in_comment)
 835         *cp = ' ';
 836       cp++;
 837     }
 838 }
 839
 840 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 841    tokenization by stripping comments and determining whether it
 842    is the beginning or end of a command and storing into
 843    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 844 void
 845 lex_preprocess_line (struct string *line,
 846                      enum syntax_mode syntax,
 847                      bool *line_starts_command,
 848                      bool *line_ends_command)
 849 {
 850   strip_comments (line);
 851   ds_rtrim (line, ss_cstr (CC_SPACES));
 852   *line_ends_command = (ds_chomp (line, get_endcmd ())
 853                         || (ds_is_empty (line) && get_nulline ()));
 854   *line_starts_command = false;
 855   if (syntax == GETL_BATCH)
 856     {
 857       int first = ds_first (line);
 858       *line_starts_command = !isspace (first);
 859       if (first == '+' || first == '-')
 860         *ds_data (line) = ' ';
 861     }
 862 }
 863
 864 /* Reads a line, without performing any preprocessing.
 865    Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax
 866    mode. */
 867 bool
 868 lex_get_line_raw (struct lexer *lexer)
 869 {
 870   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 871   enum syntax_mode mode = lex_current_syntax_mode (lexer);
 872   journal_write (mode == GETL_BATCH, ds_cstr (&lexer->line_buffer));
 873
 874   return ok;
 875 }
 876
 877 /* Reads a line for use by the tokenizer, and preprocesses it by
 878    removing comments, stripping trailing whitespace and the
 879    terminal dot, and removing leading indentors. */
 880 bool
 881 lex_get_line (struct lexer *lexer)
 882 {
 883   bool line_starts_command;
 884
 885   if (!lex_get_line_raw (lexer))
 886     {
 887       lexer->prog = NULL;
 888       return false;
 889     }
 890
 891   lex_preprocess_line (&lexer->line_buffer,
 892                        lex_current_syntax_mode (lexer),
 893                        &line_starts_command, &lexer->dot);
 894
 895   if (line_starts_command)
 896     lexer->put_token = '.';
 897
 898   lexer->prog = ds_cstr (&lexer->line_buffer);
 899   return true;
 900 }
 901 \f
 902 /* Token names. */
 903
 904 /* Returns the name of a token. */
 905 const char *
 906 lex_token_name (int token)
 907 {
 908   if (lex_is_keyword (token))
 909     return lex_id_name (token);
 910   else if (token < 256)
 911     {
 912       static char t[256][2];
 913       char *s = t[token];
 914       s[0] = token;
 915       s[1] = '\0';
 916       return s;
 917     }
 918   else
 919     NOT_REACHED ();
 920 }
 921
 922 /* Returns an ASCII representation of the current token as a
 923    malloc()'d string. */
 924 char *
 925 lex_token_representation (struct lexer *lexer)
 926 {
 927   char *token_rep;
 928
 929   switch (lexer->token)
 930     {
 931     case T_ID:
 932     case T_POS_NUM:
 933     case T_NEG_NUM:
 934       return ds_xstrdup (&lexer->tokstr);
 935       break;
 936
 937     case T_STRING:
 938       {
 939         int hexstring = 0;
 940         char *sp, *dp;
 941
 942         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 943           if (!isprint ((unsigned char) *sp))
 944             {
 945               hexstring = 1;
 946               break;
 947             }
 948
 949         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 950
 951         dp = token_rep;
 952         if (hexstring)
 953           *dp++ = 'X';
 954         *dp++ = '\'';
 955
 956         if (!hexstring)
 957           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 958             {
 959               if (*sp == '\'')
 960                 *dp++ = '\'';
 961               *dp++ = (unsigned char) *sp++;
 962             }
 963         else
 964           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 965             {
 966               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 967               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 968             }
 969         *dp++ = '\'';
 970         *dp = '\0';
 971
 972         return token_rep;
 973       }
 974     break;
 975
 976     case T_STOP:
 977       token_rep = xmalloc (1);
 978       *token_rep = '\0';
 979       return token_rep;
 980
 981     case T_EXP:
 982       return xstrdup ("**");
 983
 984     default:
 985       return xstrdup (lex_token_name (lexer->token));
 986     }
 987
 988   NOT_REACHED ();
 989 }
 990 \f
 991 /* Really weird functions. */
 992
 993 /* Most of the time, a `-' is a lead-in to a negative number.  But
 994    sometimes it's actually part of the syntax.  If a dash can be part
 995    of syntax then this function is called to rip it off of a
 996    number. */
 997 void
 998 lex_negative_to_dash (struct lexer *lexer)
 999 {
1000   if (lexer->token == T_NEG_NUM)
1001     {
1002       lexer->token = T_POS_NUM;
1003       lexer->tokval = -lexer->tokval;
1004       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
1005       save_token (lexer);
1006       lexer->token = '-';
1007     }
1008 }
1009
1010 /* Skip a COMMENT command. */
1011 void
1012 lex_skip_comment (struct lexer *lexer)
1013 {
1014   for (;;)
1015     {
1016       if (!lex_get_line (lexer))
1017         {
1018           lexer->put_token = T_STOP;
1019           lexer->prog = NULL;
1020           return;
1021         }
1022
1023       if (lexer->put_token == '.')
1024         break;
1025
1026       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1027       lexer->prog = ds_end (&lexer->line_buffer);
1028       if (lexer->dot)
1029         break;
1030     }
1031 }
1032 \f
1033 /* Private functions. */
1034
1035 /* When invoked, tokstr contains a string of binary, octal, or
1036    hex digits, according to TYPE.  The string is converted to
1037    characters having the specified values. */
1038 static void
1039 convert_numeric_string_to_char_string (struct lexer *lexer,
1040                                        enum string_type type)
1041 {
1042   const char *base_name;
1043   int base;
1044   int chars_per_byte;
1045   size_t byte_cnt;
1046   size_t i;
1047   char *p;
1048
1049   switch (type)
1050     {
1051     case BINARY_STRING:
1052       base_name = _("binary");
1053       base = 2;
1054       chars_per_byte = 8;
1055       break;
1056     case OCTAL_STRING:
1057       base_name = _("octal");
1058       base = 8;
1059       chars_per_byte = 3;
1060       break;
1061     case HEX_STRING:
1062       base_name = _("hex");
1063       base = 16;
1064       chars_per_byte = 2;
1065       break;
1066     default:
1067       NOT_REACHED ();
1068     }
1069
1070   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1071   if (ds_length (&lexer->tokstr) % chars_per_byte)
1072     msg (SE, _("String of %s digits has %zu characters, which is not a "
1073                "multiple of %d."),
1074          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1075
1076   p = ds_cstr (&lexer->tokstr);
1077   for (i = 0; i < byte_cnt; i++)
1078     {
1079       int value;
1080       int j;
1081
1082       value = 0;
1083       for (j = 0; j < chars_per_byte; j++, p++)
1084         {
1085           int v;
1086
1087           if (*p >= '0' && *p <= '9')
1088             v = *p - '0';
1089           else
1090             {
1091               static const char alpha[] = "abcdef";
1092               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1093
1094               if (q)
1095                 v = q - alpha + 10;
1096               else
1097                 v = base;
1098             }
1099
1100           if (v >= base)
1101             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1102
1103           value = value * base + v;
1104         }
1105
1106       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1107     }
1108
1109   ds_truncate (&lexer->tokstr, byte_cnt);
1110 }
1111
1112 /* Parses a string from the input buffer into tokstr.  The input
1113    buffer pointer lexer->prog must point to the initial single or double
1114    quote.  TYPE indicates the type of string to be parsed.
1115    Returns token type. */
1116 static int
1117 parse_string (struct lexer *lexer, enum string_type type)
1118 {
1119   if (type != CHARACTER_STRING)
1120     lexer->prog++;
1121
1122   /* Accumulate the entire string, joining sections indicated by +
1123      signs. */
1124   for (;;)
1125     {
1126       /* Single or double quote. */
1127       int c = *lexer->prog++;
1128
1129       /* Accumulate section. */
1130       for (;;)
1131         {
1132           /* Check end of line. */
1133           if (*lexer->prog == '\0')
1134             {
1135               msg (SE, _("Unterminated string constant."));
1136               goto finish;
1137             }
1138
1139           /* Double quote characters to embed them in strings. */
1140           if (*lexer->prog == c)
1141             {
1142               if (lexer->prog[1] == c)
1143                 lexer->prog++;
1144               else
1145                 break;
1146             }
1147
1148           ds_put_char (&lexer->tokstr, *lexer->prog++);
1149         }
1150       lexer->prog++;
1151
1152       /* Skip whitespace after final quote mark. */
1153       if (lexer->prog == NULL)
1154         break;
1155       for (;;)
1156         {
1157           while (isspace ((unsigned char) *lexer->prog))
1158             lexer->prog++;
1159           if (*lexer->prog)
1160             break;
1161
1162           if (lexer->dot)
1163             goto finish;
1164
1165           if (!lex_get_line (lexer))
1166             goto finish;
1167         }
1168
1169       /* Skip plus sign. */
1170       if (*lexer->prog != '+')
1171         break;
1172       lexer->prog++;
1173
1174       /* Skip whitespace after plus sign. */
1175       if (lexer->prog == NULL)
1176         break;
1177       for (;;)
1178         {
1179           while (isspace ((unsigned char) *lexer->prog))
1180             lexer->prog++;
1181           if (*lexer->prog)
1182             break;
1183
1184           if (lexer->dot)
1185             goto finish;
1186
1187           if (!lex_get_line (lexer))
1188             {
1189               msg (SE, _("Unexpected end of file in string concatenation."));
1190               goto finish;
1191             }
1192         }
1193
1194       /* Ensure that a valid string follows. */
1195       if (*lexer->prog != '\'' && *lexer->prog != '"')
1196         {
1197           msg (SE, _("String expected following `+'."));
1198           goto finish;
1199         }
1200     }
1201
1202   /* We come here when we've finished concatenating all the string sections
1203      into one large string. */
1204 finish:
1205   if (type != CHARACTER_STRING)
1206     convert_numeric_string_to_char_string (lexer, type);
1207
1208   if (ds_length (&lexer->tokstr) > 255)
1209     {
1210       msg (SE, _("String exceeds 255 characters in length (%zu characters)."),
1211            ds_length (&lexer->tokstr));
1212       ds_truncate (&lexer->tokstr, 255);
1213     }
1214
1215   return T_STRING;
1216 }
1217 \f
1218 #if DUMP_TOKENS
1219 /* Reads one token from the lexer and writes a textual representation
1220    on stdout for debugging purposes. */
1221 static void
1222 dump_token (struct lexer *lexer)
1223 {
1224   {
1225     const char *curfn;
1226     int curln;
1227
1228     curln = getl_source_location (lexer->ss);
1229     curfn = getl_source_name (lexer->ss);
1230     if (curfn)
1231       fprintf (stderr, "%s:%d\t", curfn, curln);
1232   }
1233
1234   switch (lexer->token)
1235     {
1236     case T_ID:
1237       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1238       break;
1239
1240     case T_POS_NUM:
1241     case T_NEG_NUM:
1242       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1243       break;
1244
1245     case T_STRING:
1246       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1247       break;
1248
1249     case T_STOP:
1250       fprintf (stderr, "STOP\n");
1251       break;
1252
1253     case T_EXP:
1254       fprintf (stderr, "MISC\tEXP\"");
1255       break;
1256
1257     case 0:
1258       fprintf (stderr, "MISC\tEOF\n");
1259       break;
1260
1261     default:
1262       if (lex_is_keyword (lexer->token))
1263         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (lexer->token));
1264       else
1265         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1266       break;
1267     }
1268 }
1269 #endif /* DUMP_TOKENS */
1270
1271
1272 /* Token Accessor Functions */
1273
1274 int
1275 lex_token (const struct lexer *lexer)
1276 {
1277   return lexer->token;
1278 }
1279
1280 double
1281 lex_tokval (const struct lexer *lexer)
1282 {
1283   return lexer->tokval;
1284 }
1285
1286 const char *
1287 lex_tokid (const struct lexer *lexer)
1288 {
1289   return lexer->tokid;
1290 }
1291
1292 const struct string *
1293 lex_tokstr (const struct lexer *lexer)
1294 {
1295   return &lexer->tokstr;
1296 }