src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <language/line-buffer.h>
  34 #include <libpspp/magic.h>
  35 #include <data/settings.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44 /*
  45 #define DUMP_TOKENS 1
  46 */
  47
  48 struct lexer
  49 {
  50   struct string line_buffer;
  51
  52   bool (*read_line) (struct string *, bool *);
  53
  54   int token;      /* Current token. */
  55   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  56
  57   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  58
  59   struct string tokstr;   /* T_ID, T_STRING: token string value.
  60                             For T_ID, this is not truncated as is
  61                             tokid. */
  62
  63   char *prog; /* Pointer to next token in line_buffer. */
  64   bool dot;   /* True only if this line ends with a terminal dot. */
  65   bool eof;   /* True only if the last token returned was T_STOP. */
  66
  67   int put_token ; /* If nonzero, next token returned by lex_get().
  68                     Used only in exceptional circumstances. */
  69
  70   struct string put_tokstr;
  71   double put_tokval;
  72 };
  73
  74
  75 static int parse_id (struct lexer *);
  76
  77 /* How a string represents its contents. */
  78 enum string_type
  79   {
  80     CHARACTER_STRING,   /* Characters. */
  81     BINARY_STRING,      /* Binary digits. */
  82     OCTAL_STRING,       /* Octal digits. */
  83     HEX_STRING          /* Hexadecimal digits. */
  84   };
  85
  86 static int parse_string (struct lexer *, enum string_type);
  87
  88 #if DUMP_TOKENS
  89 static void dump_token (void);
  90 #endif
  91 \f
  92 /* Initialization. */
  93
  94 /* Initializes the lexer. */
  95 struct lexer *
  96 lex_create (bool (*read_line_func) (struct string *, bool *))
  97 {
  98   struct lexer *lexer = xzalloc (sizeof (*lexer));
  99
 100   ds_init_empty (&lexer->tokstr);
 101   ds_init_empty (&lexer->put_tokstr);
 102   ds_init_empty (&lexer->line_buffer);
 103   lexer->read_line = read_line_func;
 104
 105   if (!lex_get_line (lexer))
 106     lexer->eof = true;
 107
 108   return lexer;
 109 }
 110
 111 void
 112 lex_destroy (struct lexer *lexer)
 113 {
 114   ds_destroy (&lexer->put_tokstr);
 115   ds_destroy (&lexer->tokstr);
 116   ds_destroy (&lexer->line_buffer);
 117
 118   free (lexer);
 119 }
 120
 121 \f
 122 /* Common functions. */
 123
 124 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 125    tokval, respectively, and sets tokid appropriately. */
 126 static void
 127 restore_token (struct lexer *lexer)
 128 {
 129   assert (lexer->put_token != 0);
 130   lexer->token = lexer->put_token;
 131   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 132   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 133   lexer->tokval = lexer->put_tokval;
 134   lexer->put_token = 0;
 135 }
 136
 137 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 138    put_lexer->tokval respectively. */
 139 static void
 140 save_token (struct lexer *lexer)
 141 {
 142   lexer->put_token = lexer->token;
 143   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 144   lexer->put_tokval = lexer->tokval;
 145 }
 146
 147 /* Parses a single token, setting appropriate global variables to
 148    indicate the token's attributes. */
 149 void
 150 lex_get (struct lexer *lexer)
 151 {
 152   /* If a token was pushed ahead, return it. */
 153   if (lexer->put_token)
 154     {
 155       restore_token (lexer);
 156 #if DUMP_TOKENS
 157       dump_token ();
 158 #endif
 159       return;
 160     }
 161
 162   /* Find a token. */
 163   for (;;)
 164     {
 165       /* Skip whitespace. */
 166       if (lexer->eof)
 167         {
 168           lexer->token = T_STOP;
 169           return;
 170         }
 171
 172       for (;;)
 173         {
 174           while (isspace ((unsigned char) *lexer->prog))
 175             lexer->prog++;
 176           if (*lexer->prog)
 177             break;
 178
 179           if (lexer->dot)
 180             {
 181               lexer->dot = 0;
 182               lexer->token = '.';
 183 #if DUMP_TOKENS
 184               dump_token ();
 185 #endif
 186               return;
 187             }
 188           else if (!lex_get_line (lexer))
 189             {
 190               lexer->eof = true;
 191               lexer->token = T_STOP;
 192 #if DUMP_TOKENS
 193               dump_token ();
 194 #endif
 195               return;
 196             }
 197
 198           if (lexer->put_token)
 199             {
 200               restore_token (lexer);
 201 #if DUMP_TOKENS
 202               dump_token ();
 203 #endif
 204               return;
 205             }
 206         }
 207
 208
 209       /* Actually parse the token. */
 210       ds_clear (&lexer->tokstr);
 211
 212       switch (*lexer->prog)
 213         {
 214         case '-': case '.':
 215         case '0': case '1': case '2': case '3': case '4':
 216         case '5': case '6': case '7': case '8': case '9':
 217           {
 218             char *tail;
 219
 220             /* `-' can introduce a negative number, or it can be a
 221                token by itself.  If it is not followed by a digit or a
 222                decimal point, it is definitely not a number.
 223                Otherwise, it might be either, but most of the time we
 224                want it as a number.  When the syntax calls for a `-'
 225                token, lex_negative_to_dash() must be used to break
 226                negative numbers into two tokens. */
 227             if (*lexer->prog == '-')
 228               {
 229                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 230                 while (isspace ((unsigned char) *lexer->prog))
 231                   lexer->prog++;
 232
 233                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 234                   {
 235                     lexer->token = '-';
 236                     break;
 237                   }
 238                 lexer->token = T_NEG_NUM;
 239               }
 240             else
 241               lexer->token = T_POS_NUM;
 242
 243             /* Parse the number, copying it into tokstr. */
 244             while (isdigit ((unsigned char) *lexer->prog))
 245               ds_put_char (&lexer->tokstr, *lexer->prog++);
 246             if (*lexer->prog == '.')
 247               {
 248                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 249                 while (isdigit ((unsigned char) *lexer->prog))
 250                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 251               }
 252             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 253               {
 254                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 255                 if (*lexer->prog == '+' || *lexer->prog == '-')
 256                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 257                 while (isdigit ((unsigned char) *lexer->prog))
 258                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 259               }
 260
 261             /* Parse as floating point. */
 262             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 263             if (*tail)
 264               {
 265                 msg (SE, _("%s does not form a valid number."),
 266                      ds_cstr (&lexer->tokstr));
 267                 lexer->tokval = 0.0;
 268
 269                 ds_clear (&lexer->tokstr);
 270                 ds_put_char (&lexer->tokstr, '0');
 271               }
 272
 273             break;
 274           }
 275
 276         case '\'': case '"':
 277           lexer->token = parse_string (lexer, CHARACTER_STRING);
 278           break;
 279
 280         case '(': case ')': case ',': case '=': case '+': case '/':
 281           lexer->token = *lexer->prog++;
 282           break;
 283
 284         case '*':
 285           if (*++lexer->prog == '*')
 286             {
 287               lexer->prog++;
 288               lexer->token = T_EXP;
 289             }
 290           else
 291             lexer->token = '*';
 292           break;
 293
 294         case '<':
 295           if (*++lexer->prog == '=')
 296             {
 297               lexer->prog++;
 298               lexer->token = T_LE;
 299             }
 300           else if (*lexer->prog == '>')
 301             {
 302               lexer->prog++;
 303               lexer->token = T_NE;
 304             }
 305           else
 306             lexer->token = T_LT;
 307           break;
 308
 309         case '>':
 310           if (*++lexer->prog == '=')
 311             {
 312               lexer->prog++;
 313               lexer->token = T_GE;
 314             }
 315           else
 316             lexer->token = T_GT;
 317           break;
 318
 319         case '~':
 320           if (*++lexer->prog == '=')
 321             {
 322               lexer->prog++;
 323               lexer->token = T_NE;
 324             }
 325           else
 326             lexer->token = T_NOT;
 327           break;
 328
 329         case '&':
 330           lexer->prog++;
 331           lexer->token = T_AND;
 332           break;
 333
 334         case '|':
 335           lexer->prog++;
 336           lexer->token = T_OR;
 337           break;
 338
 339         case 'b': case 'B':
 340           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 341             lexer->token = parse_string (lexer, BINARY_STRING);
 342           else
 343             lexer->token = parse_id (lexer);
 344           break;
 345
 346         case 'o': case 'O':
 347           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 348             lexer->token = parse_string (lexer, OCTAL_STRING);
 349           else
 350             lexer->token = parse_id (lexer);
 351           break;
 352
 353         case 'x': case 'X':
 354           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 355             lexer->token = parse_string (lexer, HEX_STRING);
 356           else
 357             lexer->token = parse_id (lexer);
 358           break;
 359
 360         default:
 361           if (lex_is_id1 (*lexer->prog))
 362             {
 363               lexer->token = parse_id (lexer);
 364               break;
 365             }
 366           else
 367             {
 368               if (isgraph ((unsigned char) *lexer->prog))
 369                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 370               else
 371                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 372               continue;
 373             }
 374         }
 375       break;
 376     }
 377
 378 #if DUMP_TOKENS
 379   dump_token ();
 380 #endif
 381 }
 382
 383 /* Parses an identifier at the current position into tokid and
 384    tokstr.
 385    Returns the correct token type. */
 386 static int
 387 parse_id (struct lexer *lexer)
 388 {
 389   const char *start = lexer->prog;
 390   lexer->prog = lex_skip_identifier (start);
 391
 392   ds_put_substring (&lexer->tokstr, ss_buffer (start, lexer->prog - start));
 393   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 394   return lex_id_to_token (ds_cstr (&lexer->tokstr), ds_length (&lexer->tokstr));
 395 }
 396
 397 /* Reports an error to the effect that subcommand SBC may only be
 398    specified once. */
 399 void
 400 lex_sbc_only_once (const char *sbc)
 401 {
 402   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 403 }
 404
 405 /* Reports an error to the effect that subcommand SBC is
 406    missing. */
 407 void
 408 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 409 {
 410   lex_error (lexer, _("missing required subcommand %s"), sbc);
 411 }
 412
 413 /* Prints a syntax error message containing the current token and
 414    given message MESSAGE (if non-null). */
 415 void
 416 lex_error (struct lexer *lexer, const char *message, ...)
 417 {
 418   char *token_rep;
 419   char where[128];
 420
 421   token_rep = lex_token_representation (lexer);
 422   if (lexer->token == T_STOP)
 423     strcpy (where, "end of file");
 424   else if (lexer->token == '.')
 425     strcpy (where, "end of command");
 426   else
 427     snprintf (where, sizeof where, "`%s'", token_rep);
 428   free (token_rep);
 429
 430   if (message)
 431     {
 432       char buf[1024];
 433       va_list args;
 434
 435       va_start (args, message);
 436       vsnprintf (buf, 1024, message, args);
 437       va_end (args);
 438
 439       msg (SE, _("Syntax error %s at %s."), buf, where);
 440     }
 441   else
 442     msg (SE, _("Syntax error at %s."), where);
 443 }
 444
 445 /* Checks that we're at end of command.
 446    If so, returns a successful command completion code.
 447    If not, flags a syntax error and returns an error command
 448    completion code. */
 449 int
 450 lex_end_of_command (struct lexer *lexer)
 451 {
 452   if (lexer->token != '.')
 453     {
 454       lex_error (lexer, _("expecting end of command"));
 455       return CMD_FAILURE;
 456     }
 457   else
 458     return CMD_SUCCESS;
 459 }
 460 \f
 461 /* Token testing functions. */
 462
 463 /* Returns true if the current token is a number. */
 464 bool
 465 lex_is_number (struct lexer *lexer)
 466 {
 467   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 468 }
 469
 470 /* Returns the value of the current token, which must be a
 471    floating point number. */
 472 double
 473 lex_number (struct lexer *lexer)
 474 {
 475   assert (lex_is_number (lexer));
 476   return lexer->tokval;
 477 }
 478
 479 /* Returns true iff the current token is an integer. */
 480 bool
 481 lex_is_integer (struct lexer *lexer)
 482 {
 483   return (lex_is_number (lexer)
 484           && lexer->tokval != NOT_LONG
 485           && lexer->tokval >= LONG_MIN
 486           && lexer->tokval <= LONG_MAX
 487           && floor (lexer->tokval) == lexer->tokval);
 488 }
 489
 490 /* Returns the value of the current token, which must be an
 491    integer. */
 492 long
 493 lex_integer (struct lexer *lexer)
 494 {
 495   assert (lex_is_integer (lexer));
 496   return lexer->tokval;
 497 }
 498 \f
 499 /* Token matching functions. */
 500
 501 /* If TOK is the current token, skips it and returns true
 502    Otherwise, returns false. */
 503 bool
 504 lex_match (struct lexer *lexer, int t)
 505 {
 506   if (lexer->token == t)
 507     {
 508       lex_get (lexer);
 509       return true;
 510     }
 511   else
 512     return false;
 513 }
 514
 515 /* If the current token is the identifier S, skips it and returns
 516    true.  The identifier may be abbreviated to its first three
 517    letters.
 518    Otherwise, returns false. */
 519 bool
 520 lex_match_id (struct lexer *lexer, const char *s)
 521 {
 522   if (lexer->token == T_ID && lex_id_match (s, lexer->tokid))
 523     {
 524       lex_get (lexer);
 525       return true;
 526     }
 527   else
 528     return false;
 529 }
 530
 531 /* If the current token is integer N, skips it and returns true.
 532    Otherwise, returns false. */
 533 bool
 534 lex_match_int (struct lexer *lexer, int x)
 535 {
 536   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 537     {
 538       lex_get (lexer);
 539       return true;
 540     }
 541   else
 542     return false;
 543 }
 544 \f
 545 /* Forced matches. */
 546
 547 /* If this token is identifier S, fetches the next token and returns
 548    nonzero.
 549    Otherwise, reports an error and returns zero. */
 550 bool
 551 lex_force_match_id (struct lexer *lexer, const char *s)
 552 {
 553   if (lexer->token == T_ID && lex_id_match (s, lexer->tokid))
 554     {
 555       lex_get (lexer);
 556       return true;
 557     }
 558   else
 559     {
 560       lex_error (lexer, _("expecting `%s'"), s);
 561       return false;
 562     }
 563 }
 564
 565 /* If the current token is T, skips the token.  Otherwise, reports an
 566    error and returns from the current function with return value false. */
 567 bool
 568 lex_force_match (struct lexer *lexer, int t)
 569 {
 570   if (lexer->token == t)
 571     {
 572       lex_get (lexer);
 573       return true;
 574     }
 575   else
 576     {
 577       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 578       return false;
 579     }
 580 }
 581
 582 /* If this token is a string, does nothing and returns true.
 583    Otherwise, reports an error and returns false. */
 584 bool
 585 lex_force_string (struct lexer *lexer)
 586 {
 587   if (lexer->token == T_STRING)
 588     return true;
 589   else
 590     {
 591       lex_error (lexer, _("expecting string"));
 592       return false;
 593     }
 594 }
 595
 596 /* If this token is an integer, does nothing and returns true.
 597    Otherwise, reports an error and returns false. */
 598 bool
 599 lex_force_int (struct lexer *lexer)
 600 {
 601   if (lex_is_integer (lexer))
 602     return true;
 603   else
 604     {
 605       lex_error (lexer, _("expecting integer"));
 606       return false;
 607     }
 608 }
 609
 610 /* If this token is a number, does nothing and returns true.
 611    Otherwise, reports an error and returns false. */
 612 bool
 613 lex_force_num (struct lexer *lexer)
 614 {
 615   if (lex_is_number (lexer))
 616     return true;
 617
 618   lex_error (lexer, _("expecting number"));
 619   return false;
 620 }
 621
 622 /* If this token is an identifier, does nothing and returns true.
 623    Otherwise, reports an error and returns false. */
 624 bool
 625 lex_force_id (struct lexer *lexer)
 626 {
 627   if (lexer->token == T_ID)
 628     return true;
 629
 630   lex_error (lexer, _("expecting identifier"));
 631   return false;
 632 }
 633
 634 /* Weird token functions. */
 635
 636 /* Returns the first character of the next token, except that if the
 637    next token is not an identifier, the character returned will not be
 638    a character that can begin an identifier.  Specifically, the
 639    hexstring lead-in X' causes lookahead() to return '.  Note that an
 640    alphanumeric return value doesn't guarantee an ID token, it could
 641    also be a reserved-word token. */
 642 int
 643 lex_look_ahead (struct lexer *lexer)
 644 {
 645   if (lexer->put_token)
 646     return lexer->put_token;
 647
 648   for (;;)
 649     {
 650       if (lexer->eof)
 651         return 0;
 652
 653       for (;;)
 654         {
 655           while (isspace ((unsigned char) *lexer->prog))
 656             lexer->prog++;
 657           if (*lexer->prog)
 658             break;
 659
 660           if (lexer->dot)
 661             return '.';
 662           else if (!lex_get_line (lexer))
 663             return 0;
 664
 665           if (lexer->put_token)
 666             return lexer->put_token;
 667         }
 668
 669       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 670            || toupper ((unsigned char) *lexer->prog) == 'B'
 671            || toupper ((unsigned char) *lexer->prog) == 'O')
 672           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 673         return '\'';
 674
 675       return *lexer->prog;
 676     }
 677 }
 678
 679 /* Makes the current token become the next token to be read; the
 680    current token is set to T. */
 681 void
 682 lex_put_back (struct lexer *lexer, int t)
 683 {
 684   save_token (lexer);
 685   lexer->token = t;
 686 }
 687
 688 /* Makes the current token become the next token to be read; the
 689    current token is set to the identifier ID. */
 690 void
 691 lex_put_back_id (struct lexer *lexer, const char *id)
 692 {
 693   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 694   save_token (lexer);
 695   lexer->token = T_ID;
 696   ds_assign_cstr (&lexer->tokstr, id);
 697   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 698 }
 699 \f
 700 /* Weird line processing functions. */
 701
 702 /* Returns the entire contents of the current line. */
 703 const char *
 704 lex_entire_line (struct lexer *lexer)
 705 {
 706   return ds_cstr (&lexer->line_buffer);
 707 }
 708
 709 const struct string *
 710 lex_entire_line_ds (struct lexer *lexer)
 711 {
 712   return &lexer->line_buffer;
 713 }
 714
 715 /* As lex_entire_line(), but only returns the part of the current line
 716    that hasn't already been tokenized.
 717    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 718    ends with a terminal dot, or zero if it doesn't. */
 719 const char *
 720 lex_rest_of_line (struct lexer *lexer, int *end_dot)
 721 {
 722   if (end_dot)
 723     *end_dot = lexer->dot;
 724   return lexer->prog;
 725 }
 726
 727 /* Causes the rest of the current input line to be ignored for
 728    tokenization purposes. */
 729 void
 730 lex_discard_line (struct lexer *lexer)
 731 {
 732   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 733   lexer->prog = ds_end (&lexer->line_buffer);
 734   lexer->dot = false;
 735   lexer->put_token = 0;
 736 }
 737
 738
 739 /* Discards the rest of the current command.
 740    When we're reading commands from a file, we skip tokens until
 741    a terminal dot or EOF.
 742    When we're reading commands interactively from the user,
 743    that's just discarding the current line, because presumably
 744    the user doesn't want to finish typing a command that will be
 745    ignored anyway. */
 746 void
 747 lex_discard_rest_of_command (struct lexer *lexer)
 748 {
 749   if (!getl_is_interactive ())
 750     {
 751       while (lexer->token != T_STOP && lexer->token != '.')
 752         lex_get (lexer);
 753     }
 754   else
 755     lex_discard_line (lexer);
 756 }
 757 \f
 758 /* Weird line reading functions. */
 759
 760 /* Remove C-style comments in STRING, begun by slash-star and
 761    terminated by star-slash or newline. */
 762 static void
 763 strip_comments (struct string *string)
 764 {
 765   char *cp;
 766   int quote;
 767   bool in_comment;
 768
 769   in_comment = false;
 770   quote = EOF;
 771   for (cp = ds_cstr (string); *cp; )
 772     {
 773       /* If we're not in a comment, check for quote marks. */
 774       if (!in_comment)
 775         {
 776           if (*cp == quote)
 777             quote = EOF;
 778           else if (*cp == '\'' || *cp == '"')
 779             quote = *cp;
 780         }
 781
 782       /* If we're not inside a quotation, check for comment. */
 783       if (quote == EOF)
 784         {
 785           if (cp[0] == '/' && cp[1] == '*')
 786             {
 787               in_comment = true;
 788               *cp++ = ' ';
 789               *cp++ = ' ';
 790               continue;
 791             }
 792           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 793             {
 794               in_comment = false;
 795               *cp++ = ' ';
 796               *cp++ = ' ';
 797               continue;
 798             }
 799         }
 800
 801       /* Check commenting. */
 802       if (in_comment)
 803         *cp = ' ';
 804       cp++;
 805     }
 806 }
 807
 808 /* Reads a line, without performing any preprocessing */
 809 bool
 810 lex_get_line_raw (struct lexer *lexer)
 811 {
 812   bool dummy;
 813   return lexer->read_line (&lexer->line_buffer, &dummy);
 814 }
 815
 816 /* Reads a line for use by the tokenizer, and preprocesses it by
 817    removing comments, stripping trailing whitespace and the
 818    terminal dot, and removing leading indentors. */
 819 bool
 820 lex_get_line (struct lexer *lexer)
 821 {
 822   struct string *line = &lexer->line_buffer;
 823   bool interactive;
 824
 825   if (!lexer->read_line (line, &interactive))
 826     return false;
 827
 828   strip_comments (line);
 829   ds_rtrim (line, ss_cstr (CC_SPACES));
 830
 831   /* Check for and remove terminal dot. */
 832   lexer->dot = (ds_chomp (line, get_endcmd ())
 833          || (ds_is_empty (line) && get_nulline ()));
 834
 835   /* Strip leading indentors or insert a terminal dot (unless the
 836      line was obtained interactively). */
 837   if (!interactive)
 838     {
 839       int first = ds_first (line);
 840
 841       if (first == '+' || first == '-')
 842         *ds_data (line) = ' ';
 843       else if (first != EOF && !isspace (first))
 844         lexer->put_token = '.';
 845     }
 846
 847   lexer->prog = ds_cstr (line);
 848
 849   return true;
 850 }
 851 \f
 852 /* Token names. */
 853
 854 /* Returns the name of a token in a static buffer. */
 855 const char *
 856 lex_token_name (int token)
 857 {
 858   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 859     return keywords[token - T_FIRST_KEYWORD];
 860
 861   if (token < 256)
 862     {
 863       static char t[2];
 864       t[0] = token;
 865       return t;
 866     }
 867
 868   NOT_REACHED ();
 869 }
 870
 871 /* Returns an ASCII representation of the current token as a
 872    malloc()'d string. */
 873 char *
 874 lex_token_representation (struct lexer *lexer)
 875 {
 876   char *token_rep;
 877
 878   switch (lexer->token)
 879     {
 880     case T_ID:
 881     case T_POS_NUM:
 882     case T_NEG_NUM:
 883       return ds_xstrdup (&lexer->tokstr);
 884       break;
 885
 886     case T_STRING:
 887       {
 888         int hexstring = 0;
 889         char *sp, *dp;
 890
 891         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 892           if (!isprint ((unsigned char) *sp))
 893             {
 894               hexstring = 1;
 895               break;
 896             }
 897
 898         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 899
 900         dp = token_rep;
 901         if (hexstring)
 902           *dp++ = 'X';
 903         *dp++ = '\'';
 904
 905         if (!hexstring)
 906           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 907             {
 908               if (*sp == '\'')
 909                 *dp++ = '\'';
 910               *dp++ = (unsigned char) *sp++;
 911             }
 912         else
 913           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 914             {
 915               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 916               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 917             }
 918         *dp++ = '\'';
 919         *dp = '\0';
 920
 921         return token_rep;
 922       }
 923     break;
 924
 925     case T_STOP:
 926       token_rep = xmalloc (1);
 927       *token_rep = '\0';
 928       return token_rep;
 929
 930     case T_EXP:
 931       return xstrdup ("**");
 932
 933     default:
 934       if (lexer->token >= T_FIRST_KEYWORD && lexer->token <= T_LAST_KEYWORD)
 935         return xstrdup (keywords [lexer->token - T_FIRST_KEYWORD]);
 936       else
 937         {
 938           token_rep = xmalloc (2);
 939           token_rep[0] = lexer->token;
 940           token_rep[1] = '\0';
 941           return token_rep;
 942         }
 943     }
 944
 945   NOT_REACHED ();
 946 }
 947 \f
 948 /* Really weird functions. */
 949
 950 /* Most of the time, a `-' is a lead-in to a negative number.  But
 951    sometimes it's actually part of the syntax.  If a dash can be part
 952    of syntax then this function is called to rip it off of a
 953    number. */
 954 void
 955 lex_negative_to_dash (struct lexer *lexer)
 956 {
 957   if (lexer->token == T_NEG_NUM)
 958     {
 959       lexer->token = T_POS_NUM;
 960       lexer->tokval = -lexer->tokval;
 961       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 962       save_token (lexer);
 963       lexer->token = '-';
 964     }
 965 }
 966
 967 /* We're not at eof any more. */
 968 void
 969 lex_reset_eof (struct lexer *lexer)
 970 {
 971   lexer->eof = false;
 972 }
 973
 974 /* Skip a COMMENT command. */
 975 void
 976 lex_skip_comment (struct lexer *lexer)
 977 {
 978   for (;;)
 979     {
 980       if (!lex_get_line (lexer))
 981         {
 982           lexer->put_token = T_STOP;
 983           lexer->eof = true;
 984           return;
 985         }
 986
 987       if (lexer->put_token == '.')
 988         break;
 989
 990       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
 991       lexer->prog = ds_end (&lexer->line_buffer);
 992       if (lexer->dot)
 993         break;
 994     }
 995 }
 996 \f
 997 /* Private functions. */
 998
 999 /* When invoked, tokstr contains a string of binary, octal, or
1000    hex digits, according to TYPE.  The string is converted to
1001    characters having the specified values. */
1002 static void
1003 convert_numeric_string_to_char_string (struct lexer *lexer,
1004                                        enum string_type type)
1005 {
1006   const char *base_name;
1007   int base;
1008   int chars_per_byte;
1009   size_t byte_cnt;
1010   size_t i;
1011   char *p;
1012
1013   switch (type)
1014     {
1015     case BINARY_STRING:
1016       base_name = _("binary");
1017       base = 2;
1018       chars_per_byte = 8;
1019       break;
1020     case OCTAL_STRING:
1021       base_name = _("octal");
1022       base = 8;
1023       chars_per_byte = 3;
1024       break;
1025     case HEX_STRING:
1026       base_name = _("hex");
1027       base = 16;
1028       chars_per_byte = 2;
1029       break;
1030     default:
1031       NOT_REACHED ();
1032     }
1033
1034   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1035   if (ds_length (&lexer->tokstr) % chars_per_byte)
1036     msg (SE, _("String of %s digits has %d characters, which is not a "
1037                "multiple of %d."),
1038          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1039
1040   p = ds_cstr (&lexer->tokstr);
1041   for (i = 0; i < byte_cnt; i++)
1042     {
1043       int value;
1044       int j;
1045
1046       value = 0;
1047       for (j = 0; j < chars_per_byte; j++, p++)
1048         {
1049           int v;
1050
1051           if (*p >= '0' && *p <= '9')
1052             v = *p - '0';
1053           else
1054             {
1055               static const char alpha[] = "abcdef";
1056               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1057
1058               if (q)
1059                 v = q - alpha + 10;
1060               else
1061                 v = base;
1062             }
1063
1064           if (v >= base)
1065             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1066
1067           value = value * base + v;
1068         }
1069
1070       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1071     }
1072
1073   ds_truncate (&lexer->tokstr, byte_cnt);
1074 }
1075
1076 /* Parses a string from the input buffer into tokstr.  The input
1077    buffer pointer lexer->prog must point to the initial single or double
1078    quote.  TYPE indicates the type of string to be parsed.
1079    Returns token type. */
1080 static int
1081 parse_string (struct lexer *lexer, enum string_type type)
1082 {
1083   if (type != CHARACTER_STRING)
1084     lexer->prog++;
1085
1086   /* Accumulate the entire string, joining sections indicated by +
1087      signs. */
1088   for (;;)
1089     {
1090       /* Single or double quote. */
1091       int c = *lexer->prog++;
1092
1093       /* Accumulate section. */
1094       for (;;)
1095         {
1096           /* Check end of line. */
1097           if (*lexer->prog == '\0')
1098             {
1099               msg (SE, _("Unterminated string constant."));
1100               goto finish;
1101             }
1102
1103           /* Double quote characters to embed them in strings. */
1104           if (*lexer->prog == c)
1105             {
1106               if (lexer->prog[1] == c)
1107                 lexer->prog++;
1108               else
1109                 break;
1110             }
1111
1112           ds_put_char (&lexer->tokstr, *lexer->prog++);
1113         }
1114       lexer->prog++;
1115
1116       /* Skip whitespace after final quote mark. */
1117       if (lexer->eof)
1118         break;
1119       for (;;)
1120         {
1121           while (isspace ((unsigned char) *lexer->prog))
1122             lexer->prog++;
1123           if (*lexer->prog)
1124             break;
1125
1126           if (lexer->dot)
1127             goto finish;
1128
1129           if (!lex_get_line (lexer))
1130             goto finish;
1131         }
1132
1133       /* Skip plus sign. */
1134       if (*lexer->prog != '+')
1135         break;
1136       lexer->prog++;
1137
1138       /* Skip whitespace after plus sign. */
1139       if (lexer->eof)
1140         break;
1141       for (;;)
1142         {
1143           while (isspace ((unsigned char) *lexer->prog))
1144             lexer->prog++;
1145           if (*lexer->prog)
1146             break;
1147
1148           if (lexer->dot)
1149             goto finish;
1150
1151           if (!lex_get_line (lexer))
1152             {
1153               msg (SE, _("Unexpected end of file in string concatenation."));
1154               goto finish;
1155             }
1156         }
1157
1158       /* Ensure that a valid string follows. */
1159       if (*lexer->prog != '\'' && *lexer->prog != '"')
1160         {
1161           msg (SE, _("String expected following `+'."));
1162           goto finish;
1163         }
1164     }
1165
1166   /* We come here when we've finished concatenating all the string sections
1167      into one large string. */
1168 finish:
1169   if (type != CHARACTER_STRING)
1170     convert_numeric_string_to_char_string (lexer, type);
1171
1172   if (ds_length (&lexer->tokstr) > 255)
1173     {
1174       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1175            ds_length (&lexer->tokstr));
1176       ds_truncate (&lexer->tokstr, 255);
1177     }
1178
1179   return T_STRING;
1180 }
1181 \f
1182 #if DUMP_TOKENS
1183 /* Reads one token from the lexer and writes a textual representation
1184    on stdout for debugging purposes. */
1185 static void
1186 dump_token (struct lexer *lexer)
1187 {
1188   {
1189     const char *curfn;
1190     int curln;
1191
1192     getl_location (&curfn, &curln);
1193     if (curfn)
1194       fprintf (stderr, "%s:%d\t", curfn, curln);
1195   }
1196
1197   switch (lexer->token)
1198     {
1199     case T_ID:
1200       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1201       break;
1202
1203     case T_POS_NUM:
1204     case T_NEG_NUM:
1205       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1206       break;
1207
1208     case T_STRING:
1209       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1210       break;
1211
1212     case T_STOP:
1213       fprintf (stderr, "STOP\n");
1214       break;
1215
1216     case T_EXP:
1217       fprintf (stderr, "MISC\tEXP\"");
1218       break;
1219
1220     case 0:
1221       fprintf (stderr, "MISC\tEOF\n");
1222       break;
1223
1224     default:
1225       if (lexer->token >= T_FIRST_KEYWORD && lexer->token <= T_LAST_KEYWORD)
1226         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1227       else
1228         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1229       break;
1230     }
1231 }
1232 #endif /* DUMP_TOKENS */
1233
1234
1235 /* Token Accessor Functions */
1236
1237 int
1238 lex_token (const struct lexer *lexer)
1239 {
1240   return lexer->token;
1241 }
1242
1243 double
1244 lex_tokval (const struct lexer *lexer)
1245 {
1246   return lexer->tokval;
1247 }
1248
1249 const char *
1250 lex_tokid (const struct lexer *lexer)
1251 {
1252   return lexer->tokid;
1253 }
1254
1255 const struct string *
1256 lex_tokstr (const struct lexer *lexer)
1257 {
1258   return &lexer->tokstr;
1259 }