src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18 #include "lexer.h"
  19 #include <libpspp/message.h>
  20 #include <ctype.h>
  21 #include <errno.h>
  22 #include <limits.h>
  23 #include <math.h>
  24 #include <stdarg.h>
  25 #include <stdlib.h>
  26 #include <libpspp/alloc.h>
  27 #include <libpspp/assertion.h>
  28 #include <language/command.h>
  29 #include <libpspp/message.h>
  30 #include <libpspp/magic.h>
  31 #include <data/settings.h>
  32 #include <libpspp/getl.h>
  33 #include <libpspp/str.h>
  34 #include <output/journal.h>
  35
  36 #include "size_max.h"
  37
  38 #include "gettext.h"
  39 #define _(msgid) gettext (msgid)
  40 #define N_(msgid) msgid
  41
  42
  43 #define DUMP_TOKENS 0
  44
  45
  46 struct lexer
  47 {
  48   struct string line_buffer;
  49
  50   struct source_stream *ss;
  51
  52   int token;      /* Current token. */
  53   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  54
  55   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  56
  57   struct string tokstr;   /* T_ID, T_STRING: token string value.
  58                             For T_ID, this is not truncated as is
  59                             tokid. */
  60
  61   char *prog; /* Pointer to next token in line_buffer. */
  62   bool dot;   /* True only if this line ends with a terminal dot. */
  63
  64   int put_token ; /* If nonzero, next token returned by lex_get().
  65                     Used only in exceptional circumstances. */
  66
  67   struct string put_tokstr;
  68   double put_tokval;
  69 };
  70
  71
  72 static int parse_id (struct lexer *);
  73
  74 /* How a string represents its contents. */
  75 enum string_type
  76   {
  77     CHARACTER_STRING,   /* Characters. */
  78     BINARY_STRING,      /* Binary digits. */
  79     OCTAL_STRING,       /* Octal digits. */
  80     HEX_STRING          /* Hexadecimal digits. */
  81   };
  82
  83 static int parse_string (struct lexer *, enum string_type);
  84
  85 #if DUMP_TOKENS
  86 static void dump_token (struct lexer *);
  87 #endif
  88 \f
  89 /* Initialization. */
  90
  91 /* Initializes the lexer. */
  92 struct lexer *
  93 lex_create (struct source_stream *ss)
  94 {
  95   struct lexer *lexer = xzalloc (sizeof (*lexer));
  96
  97   ds_init_empty (&lexer->tokstr);
  98   ds_init_empty (&lexer->put_tokstr);
  99   ds_init_empty (&lexer->line_buffer);
 100   lexer->ss = ss;
 101
 102   return lexer;
 103 }
 104
 105 struct source_stream *
 106 lex_get_source_stream (const struct lexer *lex)
 107 {
 108   return lex->ss;
 109 }
 110
 111
 112 void
 113 lex_destroy (struct lexer *lexer)
 114 {
 115   if ( NULL != lexer )
 116     {
 117       ds_destroy (&lexer->put_tokstr);
 118       ds_destroy (&lexer->tokstr);
 119       ds_destroy (&lexer->line_buffer);
 120
 121       free (lexer);
 122     }
 123 }
 124
 125 \f
 126 /* Common functions. */
 127
 128 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 129    tokval, respectively, and sets tokid appropriately. */
 130 static void
 131 restore_token (struct lexer *lexer)
 132 {
 133   assert (lexer->put_token != 0);
 134   lexer->token = lexer->put_token;
 135   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 136   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 137   lexer->tokval = lexer->put_tokval;
 138   lexer->put_token = 0;
 139 }
 140
 141 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 142    put_lexer->tokval respectively. */
 143 static void
 144 save_token (struct lexer *lexer)
 145 {
 146   lexer->put_token = lexer->token;
 147   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 148   lexer->put_tokval = lexer->tokval;
 149 }
 150
 151 /* Parses a single token, setting appropriate global variables to
 152    indicate the token's attributes. */
 153 void
 154 lex_get (struct lexer *lexer)
 155 {
 156   /* Find a token. */
 157   for (;;)
 158     {
 159       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 160         {
 161           lexer->token = T_STOP;
 162           return;
 163         }
 164
 165   /* If a token was pushed ahead, return it. */
 166   if (lexer->put_token)
 167     {
 168       restore_token (lexer);
 169 #if DUMP_TOKENS
 170           dump_token (lexer);
 171 #endif
 172       return;
 173     }
 174
 175   for (;;)
 176     {
 177       /* Skip whitespace. */
 178           while (isspace ((unsigned char) *lexer->prog))
 179             lexer->prog++;
 180
 181           if (*lexer->prog)
 182             break;
 183
 184           if (lexer->dot)
 185             {
 186               lexer->dot = 0;
 187               lexer->token = '.';
 188 #if DUMP_TOKENS
 189               dump_token (lexer);
 190 #endif
 191               return;
 192             }
 193           else if (!lex_get_line (lexer))
 194             {
 195               lexer->prog = NULL;
 196               lexer->token = T_STOP;
 197 #if DUMP_TOKENS
 198               dump_token (lexer);
 199 #endif
 200               return;
 201             }
 202
 203           if (lexer->put_token)
 204             {
 205               restore_token (lexer);
 206 #if DUMP_TOKENS
 207               dump_token (lexer);
 208 #endif
 209               return;
 210             }
 211         }
 212
 213
 214       /* Actually parse the token. */
 215       ds_clear (&lexer->tokstr);
 216
 217       switch (*lexer->prog)
 218         {
 219         case '-': case '.':
 220         case '0': case '1': case '2': case '3': case '4':
 221         case '5': case '6': case '7': case '8': case '9':
 222           {
 223             char *tail;
 224
 225             /* `-' can introduce a negative number, or it can be a
 226                token by itself.  If it is not followed by a digit or a
 227                decimal point, it is definitely not a number.
 228                Otherwise, it might be either, but most of the time we
 229                want it as a number.  When the syntax calls for a `-'
 230                token, lex_negative_to_dash() must be used to break
 231                negative numbers into two tokens. */
 232             if (*lexer->prog == '-')
 233               {
 234                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 235                 while (isspace ((unsigned char) *lexer->prog))
 236                   lexer->prog++;
 237
 238                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 239                   {
 240                     lexer->token = '-';
 241                     break;
 242                   }
 243                 lexer->token = T_NEG_NUM;
 244               }
 245             else
 246               lexer->token = T_POS_NUM;
 247
 248             /* Parse the number, copying it into tokstr. */
 249             while (isdigit ((unsigned char) *lexer->prog))
 250               ds_put_char (&lexer->tokstr, *lexer->prog++);
 251             if (*lexer->prog == '.')
 252               {
 253                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 254                 while (isdigit ((unsigned char) *lexer->prog))
 255                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 256               }
 257             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 258               {
 259                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 260                 if (*lexer->prog == '+' || *lexer->prog == '-')
 261                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 262                 while (isdigit ((unsigned char) *lexer->prog))
 263                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 264               }
 265
 266             /* Parse as floating point. */
 267             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 268             if (*tail)
 269               {
 270                 msg (SE, _("%s does not form a valid number."),
 271                      ds_cstr (&lexer->tokstr));
 272                 lexer->tokval = 0.0;
 273
 274                 ds_clear (&lexer->tokstr);
 275                 ds_put_char (&lexer->tokstr, '0');
 276               }
 277
 278             break;
 279           }
 280
 281         case '\'': case '"':
 282           lexer->token = parse_string (lexer, CHARACTER_STRING);
 283           break;
 284
 285         case '(': case ')': case ',': case '=': case '+': case '/':
 286           lexer->token = *lexer->prog++;
 287           break;
 288
 289         case '*':
 290           if (*++lexer->prog == '*')
 291             {
 292               lexer->prog++;
 293               lexer->token = T_EXP;
 294             }
 295           else
 296             lexer->token = '*';
 297           break;
 298
 299         case '<':
 300           if (*++lexer->prog == '=')
 301             {
 302               lexer->prog++;
 303               lexer->token = T_LE;
 304             }
 305           else if (*lexer->prog == '>')
 306             {
 307               lexer->prog++;
 308               lexer->token = T_NE;
 309             }
 310           else
 311             lexer->token = T_LT;
 312           break;
 313
 314         case '>':
 315           if (*++lexer->prog == '=')
 316             {
 317               lexer->prog++;
 318               lexer->token = T_GE;
 319             }
 320           else
 321             lexer->token = T_GT;
 322           break;
 323
 324         case '~':
 325           if (*++lexer->prog == '=')
 326             {
 327               lexer->prog++;
 328               lexer->token = T_NE;
 329             }
 330           else
 331             lexer->token = T_NOT;
 332           break;
 333
 334         case '&':
 335           lexer->prog++;
 336           lexer->token = T_AND;
 337           break;
 338
 339         case '|':
 340           lexer->prog++;
 341           lexer->token = T_OR;
 342           break;
 343
 344         case 'b': case 'B':
 345           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 346             lexer->token = parse_string (lexer, BINARY_STRING);
 347           else
 348             lexer->token = parse_id (lexer);
 349           break;
 350
 351         case 'o': case 'O':
 352           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 353             lexer->token = parse_string (lexer, OCTAL_STRING);
 354           else
 355             lexer->token = parse_id (lexer);
 356           break;
 357
 358         case 'x': case 'X':
 359           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 360             lexer->token = parse_string (lexer, HEX_STRING);
 361           else
 362             lexer->token = parse_id (lexer);
 363           break;
 364
 365         default:
 366           if (lex_is_id1 (*lexer->prog))
 367             {
 368               lexer->token = parse_id (lexer);
 369               break;
 370             }
 371           else
 372             {
 373               if (isgraph ((unsigned char) *lexer->prog))
 374                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 375               else
 376                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 377               continue;
 378             }
 379         }
 380       break;
 381     }
 382
 383 #if DUMP_TOKENS
 384   dump_token (lexer);
 385 #endif
 386 }
 387
 388 /* Parses an identifier at the current position into tokid and
 389    tokstr.
 390    Returns the correct token type. */
 391 static int
 392 parse_id (struct lexer *lexer)
 393 {
 394   struct substring rest_of_line
 395     = ss_substr (ds_ss (&lexer->line_buffer),
 396                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 397                  SIZE_MAX);
 398   struct substring id = ss_head (rest_of_line,
 399                                  lex_id_get_length (rest_of_line));
 400   lexer->prog += ss_length (id);
 401
 402   ds_assign_substring (&lexer->tokstr, id);
 403   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 404   return lex_id_to_token (id);
 405 }
 406
 407 /* Reports an error to the effect that subcommand SBC may only be
 408    specified once. */
 409 void
 410 lex_sbc_only_once (const char *sbc)
 411 {
 412   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 413 }
 414
 415 /* Reports an error to the effect that subcommand SBC is
 416    missing. */
 417 void
 418 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 419 {
 420   lex_error (lexer, _("missing required subcommand %s"), sbc);
 421 }
 422
 423 /* Prints a syntax error message containing the current token and
 424    given message MESSAGE (if non-null). */
 425 void
 426 lex_error (struct lexer *lexer, const char *message, ...)
 427 {
 428   char *token_rep;
 429   char where[128];
 430
 431   token_rep = lex_token_representation (lexer);
 432   if (lexer->token == T_STOP)
 433     strcpy (where, "end of file");
 434   else if (lexer->token == '.')
 435     strcpy (where, "end of command");
 436   else
 437     snprintf (where, sizeof where, "`%s'", token_rep);
 438   free (token_rep);
 439
 440   if (message)
 441     {
 442       char buf[1024];
 443       va_list args;
 444
 445       va_start (args, message);
 446       vsnprintf (buf, 1024, message, args);
 447       va_end (args);
 448
 449       msg (SE, _("Syntax error %s at %s."), buf, where);
 450     }
 451   else
 452     msg (SE, _("Syntax error at %s."), where);
 453 }
 454
 455 /* Checks that we're at end of command.
 456    If so, returns a successful command completion code.
 457    If not, flags a syntax error and returns an error command
 458    completion code. */
 459 int
 460 lex_end_of_command (struct lexer *lexer)
 461 {
 462   if (lexer->token != '.')
 463     {
 464       lex_error (lexer, _("expecting end of command"));
 465       return CMD_FAILURE;
 466     }
 467   else
 468     return CMD_SUCCESS;
 469 }
 470 \f
 471 /* Token testing functions. */
 472
 473 /* Returns true if the current token is a number. */
 474 bool
 475 lex_is_number (struct lexer *lexer)
 476 {
 477   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 478 }
 479
 480
 481 /* Returns true if the current token is a string. */
 482 bool
 483 lex_is_string (struct lexer *lexer)
 484 {
 485   return lexer->token == T_STRING;
 486 }
 487
 488
 489 /* Returns the value of the current token, which must be a
 490    floating point number. */
 491 double
 492 lex_number (struct lexer *lexer)
 493 {
 494   assert (lex_is_number (lexer));
 495   return lexer->tokval;
 496 }
 497
 498 /* Returns true iff the current token is an integer. */
 499 bool
 500 lex_is_integer (struct lexer *lexer)
 501 {
 502   return (lex_is_number (lexer)
 503           && lexer->tokval != NOT_LONG
 504           && lexer->tokval >= LONG_MIN
 505           && lexer->tokval <= LONG_MAX
 506           && floor (lexer->tokval) == lexer->tokval);
 507 }
 508
 509 /* Returns the value of the current token, which must be an
 510    integer. */
 511 long
 512 lex_integer (struct lexer *lexer)
 513 {
 514   assert (lex_is_integer (lexer));
 515   return lexer->tokval;
 516 }
 517 \f
 518 /* Token matching functions. */
 519
 520 /* If TOK is the current token, skips it and returns true
 521    Otherwise, returns false. */
 522 bool
 523 lex_match (struct lexer *lexer, int t)
 524 {
 525   if (lexer->token == t)
 526     {
 527       lex_get (lexer);
 528       return true;
 529     }
 530   else
 531     return false;
 532 }
 533
 534 /* If the current token is the identifier S, skips it and returns
 535    true.  The identifier may be abbreviated to its first three
 536    letters.
 537    Otherwise, returns false. */
 538 bool
 539 lex_match_id (struct lexer *lexer, const char *s)
 540 {
 541   if (lexer->token == T_ID
 542       && lex_id_match (ss_cstr (s), ss_cstr (lexer->tokid)))
 543     {
 544       lex_get (lexer);
 545       return true;
 546     }
 547   else
 548     return false;
 549 }
 550
 551 /* If the current token is integer N, skips it and returns true.
 552    Otherwise, returns false. */
 553 bool
 554 lex_match_int (struct lexer *lexer, int x)
 555 {
 556   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 557     {
 558       lex_get (lexer);
 559       return true;
 560     }
 561   else
 562     return false;
 563 }
 564 \f
 565 /* Forced matches. */
 566
 567 /* If this token is identifier S, fetches the next token and returns
 568    nonzero.
 569    Otherwise, reports an error and returns zero. */
 570 bool
 571 lex_force_match_id (struct lexer *lexer, const char *s)
 572 {
 573   if (lex_match_id (lexer, s))
 574     return true;
 575   else
 576     {
 577       lex_error (lexer, _("expecting `%s'"), s);
 578       return false;
 579     }
 580 }
 581
 582 /* If the current token is T, skips the token.  Otherwise, reports an
 583    error and returns from the current function with return value false. */
 584 bool
 585 lex_force_match (struct lexer *lexer, int t)
 586 {
 587   if (lexer->token == t)
 588     {
 589       lex_get (lexer);
 590       return true;
 591     }
 592   else
 593     {
 594       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 595       return false;
 596     }
 597 }
 598
 599 /* If this token is a string, does nothing and returns true.
 600    Otherwise, reports an error and returns false. */
 601 bool
 602 lex_force_string (struct lexer *lexer)
 603 {
 604   if (lexer->token == T_STRING)
 605     return true;
 606   else
 607     {
 608       lex_error (lexer, _("expecting string"));
 609       return false;
 610     }
 611 }
 612
 613 /* If this token is an integer, does nothing and returns true.
 614    Otherwise, reports an error and returns false. */
 615 bool
 616 lex_force_int (struct lexer *lexer)
 617 {
 618   if (lex_is_integer (lexer))
 619     return true;
 620   else
 621     {
 622       lex_error (lexer, _("expecting integer"));
 623       return false;
 624     }
 625 }
 626
 627 /* If this token is a number, does nothing and returns true.
 628    Otherwise, reports an error and returns false. */
 629 bool
 630 lex_force_num (struct lexer *lexer)
 631 {
 632   if (lex_is_number (lexer))
 633     return true;
 634
 635   lex_error (lexer, _("expecting number"));
 636   return false;
 637 }
 638
 639 /* If this token is an identifier, does nothing and returns true.
 640    Otherwise, reports an error and returns false. */
 641 bool
 642 lex_force_id (struct lexer *lexer)
 643 {
 644   if (lexer->token == T_ID)
 645     return true;
 646
 647   lex_error (lexer, _("expecting identifier"));
 648   return false;
 649 }
 650
 651 /* Weird token functions. */
 652
 653 /* Returns the first character of the next token, except that if the
 654    next token is not an identifier, the character returned will not be
 655    a character that can begin an identifier.  Specifically, the
 656    hexstring lead-in X' causes lookahead() to return '.  Note that an
 657    alphanumeric return value doesn't guarantee an ID token, it could
 658    also be a reserved-word token. */
 659 int
 660 lex_look_ahead (struct lexer *lexer)
 661 {
 662   if (lexer->put_token)
 663     return lexer->put_token;
 664
 665   for (;;)
 666     {
 667       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 668         return 0;
 669
 670       for (;;)
 671         {
 672           while (isspace ((unsigned char) *lexer->prog))
 673             lexer->prog++;
 674           if (*lexer->prog)
 675             break;
 676
 677           if (lexer->dot)
 678             return '.';
 679           else if (!lex_get_line (lexer))
 680             return 0;
 681
 682           if (lexer->put_token)
 683             return lexer->put_token;
 684         }
 685
 686       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 687            || toupper ((unsigned char) *lexer->prog) == 'B'
 688            || toupper ((unsigned char) *lexer->prog) == 'O')
 689           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 690         return '\'';
 691
 692       return *lexer->prog;
 693     }
 694 }
 695
 696 /* Makes the current token become the next token to be read; the
 697    current token is set to T. */
 698 void
 699 lex_put_back (struct lexer *lexer, int t)
 700 {
 701   save_token (lexer);
 702   lexer->token = t;
 703 }
 704
 705 /* Makes the current token become the next token to be read; the
 706    current token is set to the identifier ID. */
 707 void
 708 lex_put_back_id (struct lexer *lexer, const char *id)
 709 {
 710   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 711   save_token (lexer);
 712   lexer->token = T_ID;
 713   ds_assign_cstr (&lexer->tokstr, id);
 714   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 715 }
 716 \f
 717 /* Weird line processing functions. */
 718
 719 /* Returns the entire contents of the current line. */
 720 const char *
 721 lex_entire_line (const struct lexer *lexer)
 722 {
 723   return ds_cstr (&lexer->line_buffer);
 724 }
 725
 726 const struct string *
 727 lex_entire_line_ds (const struct lexer *lexer)
 728 {
 729   return &lexer->line_buffer;
 730 }
 731
 732 /* As lex_entire_line(), but only returns the part of the current line
 733    that hasn't already been tokenized. */
 734 const char *
 735 lex_rest_of_line (const struct lexer *lexer)
 736 {
 737   return lexer->prog;
 738 }
 739
 740 /* Returns true if the current line ends in a terminal dot,
 741    false otherwise. */
 742 bool
 743 lex_end_dot (const struct lexer *lexer)
 744 {
 745   return lexer->dot;
 746 }
 747
 748 /* Causes the rest of the current input line to be ignored for
 749    tokenization purposes. */
 750 void
 751 lex_discard_line (struct lexer *lexer)
 752 {
 753   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 754   lexer->prog = ds_end (&lexer->line_buffer);
 755   lexer->dot = false;
 756   lexer->put_token = 0;
 757 }
 758
 759
 760 /* Discards the rest of the current command.
 761    When we're reading commands from a file, we skip tokens until
 762    a terminal dot or EOF.
 763    When we're reading commands interactively from the user,
 764    that's just discarding the current line, because presumably
 765    the user doesn't want to finish typing a command that will be
 766    ignored anyway. */
 767 void
 768 lex_discard_rest_of_command (struct lexer *lexer)
 769 {
 770   if (!getl_is_interactive (lexer->ss))
 771     {
 772       while (lexer->token != T_STOP && lexer->token != '.')
 773         lex_get (lexer);
 774     }
 775   else
 776     lex_discard_line (lexer);
 777 }
 778 \f
 779 /* Weird line reading functions. */
 780
 781 /* Remove C-style comments in STRING, begun by slash-star and
 782    terminated by star-slash or newline. */
 783 static void
 784 strip_comments (struct string *string)
 785 {
 786   char *cp;
 787   int quote;
 788   bool in_comment;
 789
 790   in_comment = false;
 791   quote = EOF;
 792   for (cp = ds_cstr (string); *cp; )
 793     {
 794       /* If we're not in a comment, check for quote marks. */
 795       if (!in_comment)
 796         {
 797           if (*cp == quote)
 798             quote = EOF;
 799           else if (*cp == '\'' || *cp == '"')
 800             quote = *cp;
 801         }
 802
 803       /* If we're not inside a quotation, check for comment. */
 804       if (quote == EOF)
 805         {
 806           if (cp[0] == '/' && cp[1] == '*')
 807             {
 808               in_comment = true;
 809               *cp++ = ' ';
 810               *cp++ = ' ';
 811               continue;
 812             }
 813           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 814             {
 815               in_comment = false;
 816               *cp++ = ' ';
 817               *cp++ = ' ';
 818               continue;
 819             }
 820         }
 821
 822       /* Check commenting. */
 823       if (in_comment)
 824         *cp = ' ';
 825       cp++;
 826     }
 827 }
 828
 829 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 830    tokenization by stripping comments and determining whether it
 831    is the beginning or end of a command and storing into
 832    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 833 void
 834 lex_preprocess_line (struct string *line,
 835                      enum getl_syntax syntax,
 836                      bool *line_starts_command,
 837                      bool *line_ends_command)
 838 {
 839   strip_comments (line);
 840   ds_rtrim (line, ss_cstr (CC_SPACES));
 841   *line_ends_command = (ds_chomp (line, get_endcmd ())
 842                         || (ds_is_empty (line) && get_nulline ()));
 843   *line_starts_command = false;
 844   if (syntax == GETL_BATCH)
 845     {
 846       int first = ds_first (line);
 847       *line_starts_command = !isspace (first);
 848       if (first == '+' || first == '-')
 849         *ds_data (line) = ' ';
 850     }
 851 }
 852
 853 /* Reads a line, without performing any preprocessing.
 854    Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax
 855    mode. */
 856 bool
 857 lex_get_line_raw (struct lexer *lexer, enum getl_syntax *syntax)
 858 {
 859   enum getl_syntax dummy;
 860   bool ok;
 861
 862   if (syntax == NULL)
 863     syntax = &dummy;
 864   ok = getl_read_line (lexer->ss, &lexer->line_buffer, syntax);
 865   journal_write (*syntax == GETL_BATCH, ds_cstr (&lexer->line_buffer));
 866
 867   return ok;
 868 }
 869
 870 /* Reads a line for use by the tokenizer, and preprocesses it by
 871    removing comments, stripping trailing whitespace and the
 872    terminal dot, and removing leading indentors. */
 873 bool
 874 lex_get_line (struct lexer *lexer)
 875 {
 876   bool line_starts_command;
 877   enum getl_syntax syntax = GETL_BATCH;
 878
 879   if (!lex_get_line_raw (lexer, &syntax))
 880     {
 881       lexer->prog = NULL;
 882       return false;
 883     }
 884
 885   lex_preprocess_line (&lexer->line_buffer, syntax,
 886                        &line_starts_command, &lexer->dot);
 887
 888   if (line_starts_command)
 889     lexer->put_token = '.';
 890
 891   lexer->prog = ds_cstr (&lexer->line_buffer);
 892   return true;
 893 }
 894 \f
 895 /* Token names. */
 896
 897 /* Returns the name of a token. */
 898 const char *
 899 lex_token_name (int token)
 900 {
 901   if (lex_is_keyword (token))
 902     return lex_id_name (token);
 903   else if (token < 256)
 904     {
 905       static char t[256][2];
 906       char *s = t[token];
 907       s[0] = token;
 908       s[1] = '\0';
 909       return s;
 910     }
 911   else
 912     NOT_REACHED ();
 913 }
 914
 915 /* Returns an ASCII representation of the current token as a
 916    malloc()'d string. */
 917 char *
 918 lex_token_representation (struct lexer *lexer)
 919 {
 920   char *token_rep;
 921
 922   switch (lexer->token)
 923     {
 924     case T_ID:
 925     case T_POS_NUM:
 926     case T_NEG_NUM:
 927       return ds_xstrdup (&lexer->tokstr);
 928       break;
 929
 930     case T_STRING:
 931       {
 932         int hexstring = 0;
 933         char *sp, *dp;
 934
 935         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 936           if (!isprint ((unsigned char) *sp))
 937             {
 938               hexstring = 1;
 939               break;
 940             }
 941
 942         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 943
 944         dp = token_rep;
 945         if (hexstring)
 946           *dp++ = 'X';
 947         *dp++ = '\'';
 948
 949         if (!hexstring)
 950           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 951             {
 952               if (*sp == '\'')
 953                 *dp++ = '\'';
 954               *dp++ = (unsigned char) *sp++;
 955             }
 956         else
 957           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 958             {
 959               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 960               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 961             }
 962         *dp++ = '\'';
 963         *dp = '\0';
 964
 965         return token_rep;
 966       }
 967     break;
 968
 969     case T_STOP:
 970       token_rep = xmalloc (1);
 971       *token_rep = '\0';
 972       return token_rep;
 973
 974     case T_EXP:
 975       return xstrdup ("**");
 976
 977     default:
 978       return xstrdup (lex_token_name (lexer->token));
 979     }
 980
 981   NOT_REACHED ();
 982 }
 983 \f
 984 /* Really weird functions. */
 985
 986 /* Most of the time, a `-' is a lead-in to a negative number.  But
 987    sometimes it's actually part of the syntax.  If a dash can be part
 988    of syntax then this function is called to rip it off of a
 989    number. */
 990 void
 991 lex_negative_to_dash (struct lexer *lexer)
 992 {
 993   if (lexer->token == T_NEG_NUM)
 994     {
 995       lexer->token = T_POS_NUM;
 996       lexer->tokval = -lexer->tokval;
 997       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 998       save_token (lexer);
 999       lexer->token = '-';
1000     }
1001 }
1002
1003 /* Skip a COMMENT command. */
1004 void
1005 lex_skip_comment (struct lexer *lexer)
1006 {
1007   for (;;)
1008     {
1009       if (!lex_get_line (lexer))
1010         {
1011           lexer->put_token = T_STOP;
1012           lexer->prog = NULL;
1013           return;
1014         }
1015
1016       if (lexer->put_token == '.')
1017         break;
1018
1019       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1020       lexer->prog = ds_end (&lexer->line_buffer);
1021       if (lexer->dot)
1022         break;
1023     }
1024 }
1025 \f
1026 /* Private functions. */
1027
1028 /* When invoked, tokstr contains a string of binary, octal, or
1029    hex digits, according to TYPE.  The string is converted to
1030    characters having the specified values. */
1031 static void
1032 convert_numeric_string_to_char_string (struct lexer *lexer,
1033                                        enum string_type type)
1034 {
1035   const char *base_name;
1036   int base;
1037   int chars_per_byte;
1038   size_t byte_cnt;
1039   size_t i;
1040   char *p;
1041
1042   switch (type)
1043     {
1044     case BINARY_STRING:
1045       base_name = _("binary");
1046       base = 2;
1047       chars_per_byte = 8;
1048       break;
1049     case OCTAL_STRING:
1050       base_name = _("octal");
1051       base = 8;
1052       chars_per_byte = 3;
1053       break;
1054     case HEX_STRING:
1055       base_name = _("hex");
1056       base = 16;
1057       chars_per_byte = 2;
1058       break;
1059     default:
1060       NOT_REACHED ();
1061     }
1062
1063   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1064   if (ds_length (&lexer->tokstr) % chars_per_byte)
1065     msg (SE, _("String of %s digits has %d characters, which is not a "
1066                "multiple of %d."),
1067          base_name, (int) ds_length (&lexer->tokstr), chars_per_byte);
1068
1069   p = ds_cstr (&lexer->tokstr);
1070   for (i = 0; i < byte_cnt; i++)
1071     {
1072       int value;
1073       int j;
1074
1075       value = 0;
1076       for (j = 0; j < chars_per_byte; j++, p++)
1077         {
1078           int v;
1079
1080           if (*p >= '0' && *p <= '9')
1081             v = *p - '0';
1082           else
1083             {
1084               static const char alpha[] = "abcdef";
1085               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1086
1087               if (q)
1088                 v = q - alpha + 10;
1089               else
1090                 v = base;
1091             }
1092
1093           if (v >= base)
1094             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1095
1096           value = value * base + v;
1097         }
1098
1099       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1100     }
1101
1102   ds_truncate (&lexer->tokstr, byte_cnt);
1103 }
1104
1105 /* Parses a string from the input buffer into tokstr.  The input
1106    buffer pointer lexer->prog must point to the initial single or double
1107    quote.  TYPE indicates the type of string to be parsed.
1108    Returns token type. */
1109 static int
1110 parse_string (struct lexer *lexer, enum string_type type)
1111 {
1112   if (type != CHARACTER_STRING)
1113     lexer->prog++;
1114
1115   /* Accumulate the entire string, joining sections indicated by +
1116      signs. */
1117   for (;;)
1118     {
1119       /* Single or double quote. */
1120       int c = *lexer->prog++;
1121
1122       /* Accumulate section. */
1123       for (;;)
1124         {
1125           /* Check end of line. */
1126           if (*lexer->prog == '\0')
1127             {
1128               msg (SE, _("Unterminated string constant."));
1129               goto finish;
1130             }
1131
1132           /* Double quote characters to embed them in strings. */
1133           if (*lexer->prog == c)
1134             {
1135               if (lexer->prog[1] == c)
1136                 lexer->prog++;
1137               else
1138                 break;
1139             }
1140
1141           ds_put_char (&lexer->tokstr, *lexer->prog++);
1142         }
1143       lexer->prog++;
1144
1145       /* Skip whitespace after final quote mark. */
1146       if (lexer->prog == NULL)
1147         break;
1148       for (;;)
1149         {
1150           while (isspace ((unsigned char) *lexer->prog))
1151             lexer->prog++;
1152           if (*lexer->prog)
1153             break;
1154
1155           if (lexer->dot)
1156             goto finish;
1157
1158           if (!lex_get_line (lexer))
1159             goto finish;
1160         }
1161
1162       /* Skip plus sign. */
1163       if (*lexer->prog != '+')
1164         break;
1165       lexer->prog++;
1166
1167       /* Skip whitespace after plus sign. */
1168       if (lexer->prog == NULL)
1169         break;
1170       for (;;)
1171         {
1172           while (isspace ((unsigned char) *lexer->prog))
1173             lexer->prog++;
1174           if (*lexer->prog)
1175             break;
1176
1177           if (lexer->dot)
1178             goto finish;
1179
1180           if (!lex_get_line (lexer))
1181             {
1182               msg (SE, _("Unexpected end of file in string concatenation."));
1183               goto finish;
1184             }
1185         }
1186
1187       /* Ensure that a valid string follows. */
1188       if (*lexer->prog != '\'' && *lexer->prog != '"')
1189         {
1190           msg (SE, _("String expected following `+'."));
1191           goto finish;
1192         }
1193     }
1194
1195   /* We come here when we've finished concatenating all the string sections
1196      into one large string. */
1197 finish:
1198   if (type != CHARACTER_STRING)
1199     convert_numeric_string_to_char_string (lexer, type);
1200
1201   if (ds_length (&lexer->tokstr) > 255)
1202     {
1203       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1204            (int) ds_length (&lexer->tokstr));
1205       ds_truncate (&lexer->tokstr, 255);
1206     }
1207
1208   return T_STRING;
1209 }
1210 \f
1211 #if DUMP_TOKENS
1212 /* Reads one token from the lexer and writes a textual representation
1213    on stdout for debugging purposes. */
1214 static void
1215 dump_token (struct lexer *lexer)
1216 {
1217   {
1218     const char *curfn;
1219     int curln;
1220
1221     curln = getl_source_location (lexer->ss);
1222     curfn = getl_source_name (lexer->ss);
1223     if (curfn)
1224       fprintf (stderr, "%s:%d\t", curfn, curln);
1225   }
1226
1227   switch (lexer->token)
1228     {
1229     case T_ID:
1230       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1231       break;
1232
1233     case T_POS_NUM:
1234     case T_NEG_NUM:
1235       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1236       break;
1237
1238     case T_STRING:
1239       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1240       break;
1241
1242     case T_STOP:
1243       fprintf (stderr, "STOP\n");
1244       break;
1245
1246     case T_EXP:
1247       fprintf (stderr, "MISC\tEXP\"");
1248       break;
1249
1250     case 0:
1251       fprintf (stderr, "MISC\tEOF\n");
1252       break;
1253
1254     default:
1255       if (lex_is_keyword (lexer->token))
1256         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (lexer->token));
1257       else
1258         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1259       break;
1260     }
1261 }
1262 #endif /* DUMP_TOKENS */
1263
1264
1265 /* Token Accessor Functions */
1266
1267 int
1268 lex_token (const struct lexer *lexer)
1269 {
1270   return lexer->token;
1271 }
1272
1273 double
1274 lex_tokval (const struct lexer *lexer)
1275 {
1276   return lexer->tokval;
1277 }
1278
1279 const char *
1280 lex_tokid (const struct lexer *lexer)
1281 {
1282   return lexer->tokid;
1283 }
1284
1285 const struct string *
1286 lex_tokstr (const struct lexer *lexer)
1287 {
1288   return &lexer->tokstr;
1289 }