src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <language/line-buffer.h>
  34 #include <libpspp/magic.h>
  35 #include <data/settings.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44 /*
  45 #define DUMP_TOKENS 1
  46 */
  47
  48 struct lexer
  49 {
  50   struct string line_buffer;
  51
  52   bool (*read_line) (struct string *, bool *);
  53
  54   int token;      /* Current token. */
  55   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  56
  57   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  58
  59   struct string tokstr;   /* T_ID, T_STRING: token string value.
  60                             For T_ID, this is not truncated as is
  61                             tokid. */
  62
  63   char *prog; /* Pointer to next token in line_buffer. */
  64   bool dot;   /* True only if this line ends with a terminal dot. */
  65   bool eof;   /* True only if the last token returned was T_STOP. */
  66
  67   int put_token ; /* If nonzero, next token returned by lex_get().
  68                     Used only in exceptional circumstances. */
  69
  70   struct string put_tokstr;
  71   double put_tokval;
  72 };
  73
  74
  75 static int parse_id (struct lexer *);
  76
  77 /* How a string represents its contents. */
  78 enum string_type
  79   {
  80     CHARACTER_STRING,   /* Characters. */
  81     BINARY_STRING,      /* Binary digits. */
  82     OCTAL_STRING,       /* Octal digits. */
  83     HEX_STRING          /* Hexadecimal digits. */
  84   };
  85
  86 static int parse_string (struct lexer *, enum string_type);
  87
  88 #if DUMP_TOKENS
  89 static void dump_token (void);
  90 #endif
  91 \f
  92 /* Initialization. */
  93
  94 /* Initializes the lexer. */
  95 struct lexer *
  96 lex_create (bool (*read_line_func) (struct string *, bool *))
  97 {
  98   struct lexer *lexer = xzalloc (sizeof (*lexer));
  99
 100   ds_init_empty (&lexer->tokstr);
 101   ds_init_empty (&lexer->put_tokstr);
 102   ds_init_empty (&lexer->line_buffer);
 103   lexer->read_line = read_line_func;
 104
 105   if (!lex_get_line (lexer))
 106     lexer->eof = true;
 107
 108   return lexer;
 109 }
 110
 111 void
 112 lex_destroy (struct lexer *lexer)
 113 {
 114   if ( NULL != lexer )
 115     {
 116       ds_destroy (&lexer->put_tokstr);
 117       ds_destroy (&lexer->tokstr);
 118       ds_destroy (&lexer->line_buffer);
 119
 120       free (lexer);
 121     }
 122 }
 123
 124 \f
 125 /* Common functions. */
 126
 127 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 128    tokval, respectively, and sets tokid appropriately. */
 129 static void
 130 restore_token (struct lexer *lexer)
 131 {
 132   assert (lexer->put_token != 0);
 133   lexer->token = lexer->put_token;
 134   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 135   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 136   lexer->tokval = lexer->put_tokval;
 137   lexer->put_token = 0;
 138 }
 139
 140 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 141    put_lexer->tokval respectively. */
 142 static void
 143 save_token (struct lexer *lexer)
 144 {
 145   lexer->put_token = lexer->token;
 146   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 147   lexer->put_tokval = lexer->tokval;
 148 }
 149
 150 /* Parses a single token, setting appropriate global variables to
 151    indicate the token's attributes. */
 152 void
 153 lex_get (struct lexer *lexer)
 154 {
 155   /* If a token was pushed ahead, return it. */
 156   if (lexer->put_token)
 157     {
 158       restore_token (lexer);
 159 #if DUMP_TOKENS
 160       dump_token ();
 161 #endif
 162       return;
 163     }
 164
 165   /* Find a token. */
 166   for (;;)
 167     {
 168       /* Skip whitespace. */
 169       if (lexer->eof)
 170         {
 171           lexer->token = T_STOP;
 172           return;
 173         }
 174
 175       for (;;)
 176         {
 177           while (isspace ((unsigned char) *lexer->prog))
 178             lexer->prog++;
 179           if (*lexer->prog)
 180             break;
 181
 182           if (lexer->dot)
 183             {
 184               lexer->dot = 0;
 185               lexer->token = '.';
 186 #if DUMP_TOKENS
 187               dump_token ();
 188 #endif
 189               return;
 190             }
 191           else if (!lex_get_line (lexer))
 192             {
 193               lexer->eof = true;
 194               lexer->token = T_STOP;
 195 #if DUMP_TOKENS
 196               dump_token ();
 197 #endif
 198               return;
 199             }
 200
 201           if (lexer->put_token)
 202             {
 203               restore_token (lexer);
 204 #if DUMP_TOKENS
 205               dump_token ();
 206 #endif
 207               return;
 208             }
 209         }
 210
 211
 212       /* Actually parse the token. */
 213       ds_clear (&lexer->tokstr);
 214
 215       switch (*lexer->prog)
 216         {
 217         case '-': case '.':
 218         case '0': case '1': case '2': case '3': case '4':
 219         case '5': case '6': case '7': case '8': case '9':
 220           {
 221             char *tail;
 222
 223             /* `-' can introduce a negative number, or it can be a
 224                token by itself.  If it is not followed by a digit or a
 225                decimal point, it is definitely not a number.
 226                Otherwise, it might be either, but most of the time we
 227                want it as a number.  When the syntax calls for a `-'
 228                token, lex_negative_to_dash() must be used to break
 229                negative numbers into two tokens. */
 230             if (*lexer->prog == '-')
 231               {
 232                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 233                 while (isspace ((unsigned char) *lexer->prog))
 234                   lexer->prog++;
 235
 236                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 237                   {
 238                     lexer->token = '-';
 239                     break;
 240                   }
 241                 lexer->token = T_NEG_NUM;
 242               }
 243             else
 244               lexer->token = T_POS_NUM;
 245
 246             /* Parse the number, copying it into tokstr. */
 247             while (isdigit ((unsigned char) *lexer->prog))
 248               ds_put_char (&lexer->tokstr, *lexer->prog++);
 249             if (*lexer->prog == '.')
 250               {
 251                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 252                 while (isdigit ((unsigned char) *lexer->prog))
 253                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 254               }
 255             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 256               {
 257                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 258                 if (*lexer->prog == '+' || *lexer->prog == '-')
 259                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 260                 while (isdigit ((unsigned char) *lexer->prog))
 261                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 262               }
 263
 264             /* Parse as floating point. */
 265             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 266             if (*tail)
 267               {
 268                 msg (SE, _("%s does not form a valid number."),
 269                      ds_cstr (&lexer->tokstr));
 270                 lexer->tokval = 0.0;
 271
 272                 ds_clear (&lexer->tokstr);
 273                 ds_put_char (&lexer->tokstr, '0');
 274               }
 275
 276             break;
 277           }
 278
 279         case '\'': case '"':
 280           lexer->token = parse_string (lexer, CHARACTER_STRING);
 281           break;
 282
 283         case '(': case ')': case ',': case '=': case '+': case '/':
 284           lexer->token = *lexer->prog++;
 285           break;
 286
 287         case '*':
 288           if (*++lexer->prog == '*')
 289             {
 290               lexer->prog++;
 291               lexer->token = T_EXP;
 292             }
 293           else
 294             lexer->token = '*';
 295           break;
 296
 297         case '<':
 298           if (*++lexer->prog == '=')
 299             {
 300               lexer->prog++;
 301               lexer->token = T_LE;
 302             }
 303           else if (*lexer->prog == '>')
 304             {
 305               lexer->prog++;
 306               lexer->token = T_NE;
 307             }
 308           else
 309             lexer->token = T_LT;
 310           break;
 311
 312         case '>':
 313           if (*++lexer->prog == '=')
 314             {
 315               lexer->prog++;
 316               lexer->token = T_GE;
 317             }
 318           else
 319             lexer->token = T_GT;
 320           break;
 321
 322         case '~':
 323           if (*++lexer->prog == '=')
 324             {
 325               lexer->prog++;
 326               lexer->token = T_NE;
 327             }
 328           else
 329             lexer->token = T_NOT;
 330           break;
 331
 332         case '&':
 333           lexer->prog++;
 334           lexer->token = T_AND;
 335           break;
 336
 337         case '|':
 338           lexer->prog++;
 339           lexer->token = T_OR;
 340           break;
 341
 342         case 'b': case 'B':
 343           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 344             lexer->token = parse_string (lexer, BINARY_STRING);
 345           else
 346             lexer->token = parse_id (lexer);
 347           break;
 348
 349         case 'o': case 'O':
 350           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 351             lexer->token = parse_string (lexer, OCTAL_STRING);
 352           else
 353             lexer->token = parse_id (lexer);
 354           break;
 355
 356         case 'x': case 'X':
 357           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 358             lexer->token = parse_string (lexer, HEX_STRING);
 359           else
 360             lexer->token = parse_id (lexer);
 361           break;
 362
 363         default:
 364           if (lex_is_id1 (*lexer->prog))
 365             {
 366               lexer->token = parse_id (lexer);
 367               break;
 368             }
 369           else
 370             {
 371               if (isgraph ((unsigned char) *lexer->prog))
 372                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 373               else
 374                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 375               continue;
 376             }
 377         }
 378       break;
 379     }
 380
 381 #if DUMP_TOKENS
 382   dump_token ();
 383 #endif
 384 }
 385
 386 /* Parses an identifier at the current position into tokid and
 387    tokstr.
 388    Returns the correct token type. */
 389 static int
 390 parse_id (struct lexer *lexer)
 391 {
 392   const char *start = lexer->prog;
 393   lexer->prog = lex_skip_identifier (start);
 394
 395   ds_put_substring (&lexer->tokstr, ss_buffer (start, lexer->prog - start));
 396   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 397   return lex_id_to_token (ds_cstr (&lexer->tokstr), ds_length (&lexer->tokstr));
 398 }
 399
 400 /* Reports an error to the effect that subcommand SBC may only be
 401    specified once. */
 402 void
 403 lex_sbc_only_once (const char *sbc)
 404 {
 405   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 406 }
 407
 408 /* Reports an error to the effect that subcommand SBC is
 409    missing. */
 410 void
 411 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 412 {
 413   lex_error (lexer, _("missing required subcommand %s"), sbc);
 414 }
 415
 416 /* Prints a syntax error message containing the current token and
 417    given message MESSAGE (if non-null). */
 418 void
 419 lex_error (struct lexer *lexer, const char *message, ...)
 420 {
 421   char *token_rep;
 422   char where[128];
 423
 424   token_rep = lex_token_representation (lexer);
 425   if (lexer->token == T_STOP)
 426     strcpy (where, "end of file");
 427   else if (lexer->token == '.')
 428     strcpy (where, "end of command");
 429   else
 430     snprintf (where, sizeof where, "`%s'", token_rep);
 431   free (token_rep);
 432
 433   if (message)
 434     {
 435       char buf[1024];
 436       va_list args;
 437
 438       va_start (args, message);
 439       vsnprintf (buf, 1024, message, args);
 440       va_end (args);
 441
 442       msg (SE, _("Syntax error %s at %s."), buf, where);
 443     }
 444   else
 445     msg (SE, _("Syntax error at %s."), where);
 446 }
 447
 448 /* Checks that we're at end of command.
 449    If so, returns a successful command completion code.
 450    If not, flags a syntax error and returns an error command
 451    completion code. */
 452 int
 453 lex_end_of_command (struct lexer *lexer)
 454 {
 455   if (lexer->token != '.')
 456     {
 457       lex_error (lexer, _("expecting end of command"));
 458       return CMD_FAILURE;
 459     }
 460   else
 461     return CMD_SUCCESS;
 462 }
 463 \f
 464 /* Token testing functions. */
 465
 466 /* Returns true if the current token is a number. */
 467 bool
 468 lex_is_number (struct lexer *lexer)
 469 {
 470   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 471 }
 472
 473 /* Returns the value of the current token, which must be a
 474    floating point number. */
 475 double
 476 lex_number (struct lexer *lexer)
 477 {
 478   assert (lex_is_number (lexer));
 479   return lexer->tokval;
 480 }
 481
 482 /* Returns true iff the current token is an integer. */
 483 bool
 484 lex_is_integer (struct lexer *lexer)
 485 {
 486   return (lex_is_number (lexer)
 487           && lexer->tokval != NOT_LONG
 488           && lexer->tokval >= LONG_MIN
 489           && lexer->tokval <= LONG_MAX
 490           && floor (lexer->tokval) == lexer->tokval);
 491 }
 492
 493 /* Returns the value of the current token, which must be an
 494    integer. */
 495 long
 496 lex_integer (struct lexer *lexer)
 497 {
 498   assert (lex_is_integer (lexer));
 499   return lexer->tokval;
 500 }
 501 \f
 502 /* Token matching functions. */
 503
 504 /* If TOK is the current token, skips it and returns true
 505    Otherwise, returns false. */
 506 bool
 507 lex_match (struct lexer *lexer, int t)
 508 {
 509   if (lexer->token == t)
 510     {
 511       lex_get (lexer);
 512       return true;
 513     }
 514   else
 515     return false;
 516 }
 517
 518 /* If the current token is the identifier S, skips it and returns
 519    true.  The identifier may be abbreviated to its first three
 520    letters.
 521    Otherwise, returns false. */
 522 bool
 523 lex_match_id (struct lexer *lexer, const char *s)
 524 {
 525   if (lexer->token == T_ID && lex_id_match (s, lexer->tokid))
 526     {
 527       lex_get (lexer);
 528       return true;
 529     }
 530   else
 531     return false;
 532 }
 533
 534 /* If the current token is integer N, skips it and returns true.
 535    Otherwise, returns false. */
 536 bool
 537 lex_match_int (struct lexer *lexer, int x)
 538 {
 539   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 540     {
 541       lex_get (lexer);
 542       return true;
 543     }
 544   else
 545     return false;
 546 }
 547 \f
 548 /* Forced matches. */
 549
 550 /* If this token is identifier S, fetches the next token and returns
 551    nonzero.
 552    Otherwise, reports an error and returns zero. */
 553 bool
 554 lex_force_match_id (struct lexer *lexer, const char *s)
 555 {
 556   if (lexer->token == T_ID && lex_id_match (s, lexer->tokid))
 557     {
 558       lex_get (lexer);
 559       return true;
 560     }
 561   else
 562     {
 563       lex_error (lexer, _("expecting `%s'"), s);
 564       return false;
 565     }
 566 }
 567
 568 /* If the current token is T, skips the token.  Otherwise, reports an
 569    error and returns from the current function with return value false. */
 570 bool
 571 lex_force_match (struct lexer *lexer, int t)
 572 {
 573   if (lexer->token == t)
 574     {
 575       lex_get (lexer);
 576       return true;
 577     }
 578   else
 579     {
 580       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 581       return false;
 582     }
 583 }
 584
 585 /* If this token is a string, does nothing and returns true.
 586    Otherwise, reports an error and returns false. */
 587 bool
 588 lex_force_string (struct lexer *lexer)
 589 {
 590   if (lexer->token == T_STRING)
 591     return true;
 592   else
 593     {
 594       lex_error (lexer, _("expecting string"));
 595       return false;
 596     }
 597 }
 598
 599 /* If this token is an integer, does nothing and returns true.
 600    Otherwise, reports an error and returns false. */
 601 bool
 602 lex_force_int (struct lexer *lexer)
 603 {
 604   if (lex_is_integer (lexer))
 605     return true;
 606   else
 607     {
 608       lex_error (lexer, _("expecting integer"));
 609       return false;
 610     }
 611 }
 612
 613 /* If this token is a number, does nothing and returns true.
 614    Otherwise, reports an error and returns false. */
 615 bool
 616 lex_force_num (struct lexer *lexer)
 617 {
 618   if (lex_is_number (lexer))
 619     return true;
 620
 621   lex_error (lexer, _("expecting number"));
 622   return false;
 623 }
 624
 625 /* If this token is an identifier, does nothing and returns true.
 626    Otherwise, reports an error and returns false. */
 627 bool
 628 lex_force_id (struct lexer *lexer)
 629 {
 630   if (lexer->token == T_ID)
 631     return true;
 632
 633   lex_error (lexer, _("expecting identifier"));
 634   return false;
 635 }
 636
 637 /* Weird token functions. */
 638
 639 /* Returns the first character of the next token, except that if the
 640    next token is not an identifier, the character returned will not be
 641    a character that can begin an identifier.  Specifically, the
 642    hexstring lead-in X' causes lookahead() to return '.  Note that an
 643    alphanumeric return value doesn't guarantee an ID token, it could
 644    also be a reserved-word token. */
 645 int
 646 lex_look_ahead (struct lexer *lexer)
 647 {
 648   if (lexer->put_token)
 649     return lexer->put_token;
 650
 651   for (;;)
 652     {
 653       if (lexer->eof)
 654         return 0;
 655
 656       for (;;)
 657         {
 658           while (isspace ((unsigned char) *lexer->prog))
 659             lexer->prog++;
 660           if (*lexer->prog)
 661             break;
 662
 663           if (lexer->dot)
 664             return '.';
 665           else if (!lex_get_line (lexer))
 666             return 0;
 667
 668           if (lexer->put_token)
 669             return lexer->put_token;
 670         }
 671
 672       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 673            || toupper ((unsigned char) *lexer->prog) == 'B'
 674            || toupper ((unsigned char) *lexer->prog) == 'O')
 675           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 676         return '\'';
 677
 678       return *lexer->prog;
 679     }
 680 }
 681
 682 /* Makes the current token become the next token to be read; the
 683    current token is set to T. */
 684 void
 685 lex_put_back (struct lexer *lexer, int t)
 686 {
 687   save_token (lexer);
 688   lexer->token = t;
 689 }
 690
 691 /* Makes the current token become the next token to be read; the
 692    current token is set to the identifier ID. */
 693 void
 694 lex_put_back_id (struct lexer *lexer, const char *id)
 695 {
 696   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 697   save_token (lexer);
 698   lexer->token = T_ID;
 699   ds_assign_cstr (&lexer->tokstr, id);
 700   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 701 }
 702 \f
 703 /* Weird line processing functions. */
 704
 705 /* Returns the entire contents of the current line. */
 706 const char *
 707 lex_entire_line (struct lexer *lexer)
 708 {
 709   return ds_cstr (&lexer->line_buffer);
 710 }
 711
 712 const struct string *
 713 lex_entire_line_ds (struct lexer *lexer)
 714 {
 715   return &lexer->line_buffer;
 716 }
 717
 718 /* As lex_entire_line(), but only returns the part of the current line
 719    that hasn't already been tokenized.
 720    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 721    ends with a terminal dot, or zero if it doesn't. */
 722 const char *
 723 lex_rest_of_line (struct lexer *lexer, int *end_dot)
 724 {
 725   if (end_dot)
 726     *end_dot = lexer->dot;
 727   return lexer->prog;
 728 }
 729
 730 /* Causes the rest of the current input line to be ignored for
 731    tokenization purposes. */
 732 void
 733 lex_discard_line (struct lexer *lexer)
 734 {
 735   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 736   lexer->prog = ds_end (&lexer->line_buffer);
 737   lexer->dot = false;
 738   lexer->put_token = 0;
 739 }
 740
 741
 742 /* Discards the rest of the current command.
 743    When we're reading commands from a file, we skip tokens until
 744    a terminal dot or EOF.
 745    When we're reading commands interactively from the user,
 746    that's just discarding the current line, because presumably
 747    the user doesn't want to finish typing a command that will be
 748    ignored anyway. */
 749 void
 750 lex_discard_rest_of_command (struct lexer *lexer)
 751 {
 752   if (!getl_is_interactive ())
 753     {
 754       while (lexer->token != T_STOP && lexer->token != '.')
 755         lex_get (lexer);
 756     }
 757   else
 758     lex_discard_line (lexer);
 759 }
 760 \f
 761 /* Weird line reading functions. */
 762
 763 /* Remove C-style comments in STRING, begun by slash-star and
 764    terminated by star-slash or newline. */
 765 static void
 766 strip_comments (struct string *string)
 767 {
 768   char *cp;
 769   int quote;
 770   bool in_comment;
 771
 772   in_comment = false;
 773   quote = EOF;
 774   for (cp = ds_cstr (string); *cp; )
 775     {
 776       /* If we're not in a comment, check for quote marks. */
 777       if (!in_comment)
 778         {
 779           if (*cp == quote)
 780             quote = EOF;
 781           else if (*cp == '\'' || *cp == '"')
 782             quote = *cp;
 783         }
 784
 785       /* If we're not inside a quotation, check for comment. */
 786       if (quote == EOF)
 787         {
 788           if (cp[0] == '/' && cp[1] == '*')
 789             {
 790               in_comment = true;
 791               *cp++ = ' ';
 792               *cp++ = ' ';
 793               continue;
 794             }
 795           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 796             {
 797               in_comment = false;
 798               *cp++ = ' ';
 799               *cp++ = ' ';
 800               continue;
 801             }
 802         }
 803
 804       /* Check commenting. */
 805       if (in_comment)
 806         *cp = ' ';
 807       cp++;
 808     }
 809 }
 810
 811 /* Reads a line, without performing any preprocessing */
 812 bool
 813 lex_get_line_raw (struct lexer *lexer)
 814 {
 815   bool dummy;
 816   return lexer->read_line (&lexer->line_buffer, &dummy);
 817 }
 818
 819 /* Reads a line for use by the tokenizer, and preprocesses it by
 820    removing comments, stripping trailing whitespace and the
 821    terminal dot, and removing leading indentors. */
 822 bool
 823 lex_get_line (struct lexer *lexer)
 824 {
 825   struct string *line = &lexer->line_buffer;
 826   bool interactive;
 827
 828   if (!lexer->read_line (line, &interactive))
 829     return false;
 830
 831   strip_comments (line);
 832   ds_rtrim (line, ss_cstr (CC_SPACES));
 833
 834   /* Check for and remove terminal dot. */
 835   lexer->dot = (ds_chomp (line, get_endcmd ())
 836          || (ds_is_empty (line) && get_nulline ()));
 837
 838   /* Strip leading indentors or insert a terminal dot (unless the
 839      line was obtained interactively). */
 840   if (!interactive)
 841     {
 842       int first = ds_first (line);
 843
 844       if (first == '+' || first == '-')
 845         *ds_data (line) = ' ';
 846       else if (first != EOF && !isspace (first))
 847         lexer->put_token = '.';
 848     }
 849
 850   lexer->prog = ds_cstr (line);
 851
 852   return true;
 853 }
 854 \f
 855 /* Token names. */
 856
 857 /* Returns the name of a token in a static buffer. */
 858 const char *
 859 lex_token_name (int token)
 860 {
 861   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 862     return keywords[token - T_FIRST_KEYWORD];
 863
 864   if (token < 256)
 865     {
 866       static char t[2];
 867       t[0] = token;
 868       return t;
 869     }
 870
 871   NOT_REACHED ();
 872 }
 873
 874 /* Returns an ASCII representation of the current token as a
 875    malloc()'d string. */
 876 char *
 877 lex_token_representation (struct lexer *lexer)
 878 {
 879   char *token_rep;
 880
 881   switch (lexer->token)
 882     {
 883     case T_ID:
 884     case T_POS_NUM:
 885     case T_NEG_NUM:
 886       return ds_xstrdup (&lexer->tokstr);
 887       break;
 888
 889     case T_STRING:
 890       {
 891         int hexstring = 0;
 892         char *sp, *dp;
 893
 894         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 895           if (!isprint ((unsigned char) *sp))
 896             {
 897               hexstring = 1;
 898               break;
 899             }
 900
 901         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 902
 903         dp = token_rep;
 904         if (hexstring)
 905           *dp++ = 'X';
 906         *dp++ = '\'';
 907
 908         if (!hexstring)
 909           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 910             {
 911               if (*sp == '\'')
 912                 *dp++ = '\'';
 913               *dp++ = (unsigned char) *sp++;
 914             }
 915         else
 916           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 917             {
 918               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 919               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 920             }
 921         *dp++ = '\'';
 922         *dp = '\0';
 923
 924         return token_rep;
 925       }
 926     break;
 927
 928     case T_STOP:
 929       token_rep = xmalloc (1);
 930       *token_rep = '\0';
 931       return token_rep;
 932
 933     case T_EXP:
 934       return xstrdup ("**");
 935
 936     default:
 937       if (lexer->token >= T_FIRST_KEYWORD && lexer->token <= T_LAST_KEYWORD)
 938         return xstrdup (keywords [lexer->token - T_FIRST_KEYWORD]);
 939       else
 940         {
 941           token_rep = xmalloc (2);
 942           token_rep[0] = lexer->token;
 943           token_rep[1] = '\0';
 944           return token_rep;
 945         }
 946     }
 947
 948   NOT_REACHED ();
 949 }
 950 \f
 951 /* Really weird functions. */
 952
 953 /* Most of the time, a `-' is a lead-in to a negative number.  But
 954    sometimes it's actually part of the syntax.  If a dash can be part
 955    of syntax then this function is called to rip it off of a
 956    number. */
 957 void
 958 lex_negative_to_dash (struct lexer *lexer)
 959 {
 960   if (lexer->token == T_NEG_NUM)
 961     {
 962       lexer->token = T_POS_NUM;
 963       lexer->tokval = -lexer->tokval;
 964       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 965       save_token (lexer);
 966       lexer->token = '-';
 967     }
 968 }
 969
 970 /* We're not at eof any more. */
 971 void
 972 lex_reset_eof (struct lexer *lexer)
 973 {
 974   lexer->eof = false;
 975 }
 976
 977 /* Skip a COMMENT command. */
 978 void
 979 lex_skip_comment (struct lexer *lexer)
 980 {
 981   for (;;)
 982     {
 983       if (!lex_get_line (lexer))
 984         {
 985           lexer->put_token = T_STOP;
 986           lexer->eof = true;
 987           return;
 988         }
 989
 990       if (lexer->put_token == '.')
 991         break;
 992
 993       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
 994       lexer->prog = ds_end (&lexer->line_buffer);
 995       if (lexer->dot)
 996         break;
 997     }
 998 }
 999 \f
1000 /* Private functions. */
1001
1002 /* When invoked, tokstr contains a string of binary, octal, or
1003    hex digits, according to TYPE.  The string is converted to
1004    characters having the specified values. */
1005 static void
1006 convert_numeric_string_to_char_string (struct lexer *lexer,
1007                                        enum string_type type)
1008 {
1009   const char *base_name;
1010   int base;
1011   int chars_per_byte;
1012   size_t byte_cnt;
1013   size_t i;
1014   char *p;
1015
1016   switch (type)
1017     {
1018     case BINARY_STRING:
1019       base_name = _("binary");
1020       base = 2;
1021       chars_per_byte = 8;
1022       break;
1023     case OCTAL_STRING:
1024       base_name = _("octal");
1025       base = 8;
1026       chars_per_byte = 3;
1027       break;
1028     case HEX_STRING:
1029       base_name = _("hex");
1030       base = 16;
1031       chars_per_byte = 2;
1032       break;
1033     default:
1034       NOT_REACHED ();
1035     }
1036
1037   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1038   if (ds_length (&lexer->tokstr) % chars_per_byte)
1039     msg (SE, _("String of %s digits has %d characters, which is not a "
1040                "multiple of %d."),
1041          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1042
1043   p = ds_cstr (&lexer->tokstr);
1044   for (i = 0; i < byte_cnt; i++)
1045     {
1046       int value;
1047       int j;
1048
1049       value = 0;
1050       for (j = 0; j < chars_per_byte; j++, p++)
1051         {
1052           int v;
1053
1054           if (*p >= '0' && *p <= '9')
1055             v = *p - '0';
1056           else
1057             {
1058               static const char alpha[] = "abcdef";
1059               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1060
1061               if (q)
1062                 v = q - alpha + 10;
1063               else
1064                 v = base;
1065             }
1066
1067           if (v >= base)
1068             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1069
1070           value = value * base + v;
1071         }
1072
1073       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1074     }
1075
1076   ds_truncate (&lexer->tokstr, byte_cnt);
1077 }
1078
1079 /* Parses a string from the input buffer into tokstr.  The input
1080    buffer pointer lexer->prog must point to the initial single or double
1081    quote.  TYPE indicates the type of string to be parsed.
1082    Returns token type. */
1083 static int
1084 parse_string (struct lexer *lexer, enum string_type type)
1085 {
1086   if (type != CHARACTER_STRING)
1087     lexer->prog++;
1088
1089   /* Accumulate the entire string, joining sections indicated by +
1090      signs. */
1091   for (;;)
1092     {
1093       /* Single or double quote. */
1094       int c = *lexer->prog++;
1095
1096       /* Accumulate section. */
1097       for (;;)
1098         {
1099           /* Check end of line. */
1100           if (*lexer->prog == '\0')
1101             {
1102               msg (SE, _("Unterminated string constant."));
1103               goto finish;
1104             }
1105
1106           /* Double quote characters to embed them in strings. */
1107           if (*lexer->prog == c)
1108             {
1109               if (lexer->prog[1] == c)
1110                 lexer->prog++;
1111               else
1112                 break;
1113             }
1114
1115           ds_put_char (&lexer->tokstr, *lexer->prog++);
1116         }
1117       lexer->prog++;
1118
1119       /* Skip whitespace after final quote mark. */
1120       if (lexer->eof)
1121         break;
1122       for (;;)
1123         {
1124           while (isspace ((unsigned char) *lexer->prog))
1125             lexer->prog++;
1126           if (*lexer->prog)
1127             break;
1128
1129           if (lexer->dot)
1130             goto finish;
1131
1132           if (!lex_get_line (lexer))
1133             goto finish;
1134         }
1135
1136       /* Skip plus sign. */
1137       if (*lexer->prog != '+')
1138         break;
1139       lexer->prog++;
1140
1141       /* Skip whitespace after plus sign. */
1142       if (lexer->eof)
1143         break;
1144       for (;;)
1145         {
1146           while (isspace ((unsigned char) *lexer->prog))
1147             lexer->prog++;
1148           if (*lexer->prog)
1149             break;
1150
1151           if (lexer->dot)
1152             goto finish;
1153
1154           if (!lex_get_line (lexer))
1155             {
1156               msg (SE, _("Unexpected end of file in string concatenation."));
1157               goto finish;
1158             }
1159         }
1160
1161       /* Ensure that a valid string follows. */
1162       if (*lexer->prog != '\'' && *lexer->prog != '"')
1163         {
1164           msg (SE, _("String expected following `+'."));
1165           goto finish;
1166         }
1167     }
1168
1169   /* We come here when we've finished concatenating all the string sections
1170      into one large string. */
1171 finish:
1172   if (type != CHARACTER_STRING)
1173     convert_numeric_string_to_char_string (lexer, type);
1174
1175   if (ds_length (&lexer->tokstr) > 255)
1176     {
1177       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1178            ds_length (&lexer->tokstr));
1179       ds_truncate (&lexer->tokstr, 255);
1180     }
1181
1182   return T_STRING;
1183 }
1184 \f
1185 #if DUMP_TOKENS
1186 /* Reads one token from the lexer and writes a textual representation
1187    on stdout for debugging purposes. */
1188 static void
1189 dump_token (struct lexer *lexer)
1190 {
1191   {
1192     const char *curfn;
1193     int curln;
1194
1195     getl_location (&curfn, &curln);
1196     if (curfn)
1197       fprintf (stderr, "%s:%d\t", curfn, curln);
1198   }
1199
1200   switch (lexer->token)
1201     {
1202     case T_ID:
1203       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1204       break;
1205
1206     case T_POS_NUM:
1207     case T_NEG_NUM:
1208       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1209       break;
1210
1211     case T_STRING:
1212       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1213       break;
1214
1215     case T_STOP:
1216       fprintf (stderr, "STOP\n");
1217       break;
1218
1219     case T_EXP:
1220       fprintf (stderr, "MISC\tEXP\"");
1221       break;
1222
1223     case 0:
1224       fprintf (stderr, "MISC\tEOF\n");
1225       break;
1226
1227     default:
1228       if (lexer->token >= T_FIRST_KEYWORD && lexer->token <= T_LAST_KEYWORD)
1229         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1230       else
1231         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1232       break;
1233     }
1234 }
1235 #endif /* DUMP_TOKENS */
1236
1237
1238 /* Token Accessor Functions */
1239
1240 int
1241 lex_token (const struct lexer *lexer)
1242 {
1243   return lexer->token;
1244 }
1245
1246 double
1247 lex_tokval (const struct lexer *lexer)
1248 {
1249   return lexer->tokval;
1250 }
1251
1252 const char *
1253 lex_tokid (const struct lexer *lexer)
1254 {
1255   return lexer->tokid;
1256 }
1257
1258 const struct string *
1259 lex_tokstr (const struct lexer *lexer)
1260 {
1261   return &lexer->tokstr;
1262 }