src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or
   5    modify it under the terms of the GNU General Public License as
   6    published by the Free Software Foundation; either version 2 of the
   7    License, or (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful, but
  10    WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17    02110-1301, USA. */
  18
  19 #include <config.h>
  20 #include "lexer.h"
  21 #include <libpspp/message.h>
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <limits.h>
  25 #include <math.h>
  26 #include <stdarg.h>
  27 #include <stdlib.h>
  28 #include <libpspp/alloc.h>
  29 #include <libpspp/assertion.h>
  30 #include <language/command.h>
  31 #include <libpspp/message.h>
  32 #include <libpspp/magic.h>
  33 #include <data/settings.h>
  34 #include <libpspp/getl.h>
  35 #include <libpspp/str.h>
  36
  37 #include "size_max.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43
  44 #define DUMP_TOKENS 0
  45
  46
  47 struct lexer
  48 {
  49   struct string line_buffer;
  50
  51   struct source_stream *ss;
  52
  53   int token;      /* Current token. */
  54   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  55
  56   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  57
  58   struct string tokstr;   /* T_ID, T_STRING: token string value.
  59                             For T_ID, this is not truncated as is
  60                             tokid. */
  61
  62   char *prog; /* Pointer to next token in line_buffer. */
  63   bool dot;   /* True only if this line ends with a terminal dot. */
  64
  65   int put_token ; /* If nonzero, next token returned by lex_get().
  66                     Used only in exceptional circumstances. */
  67
  68   struct string put_tokstr;
  69   double put_tokval;
  70 };
  71
  72
  73 static int parse_id (struct lexer *);
  74
  75 /* How a string represents its contents. */
  76 enum string_type
  77   {
  78     CHARACTER_STRING,   /* Characters. */
  79     BINARY_STRING,      /* Binary digits. */
  80     OCTAL_STRING,       /* Octal digits. */
  81     HEX_STRING          /* Hexadecimal digits. */
  82   };
  83
  84 static int parse_string (struct lexer *, enum string_type);
  85
  86 #if DUMP_TOKENS
  87 static void dump_token (struct lexer *);
  88 #endif
  89 \f
  90 /* Initialization. */
  91
  92 /* Initializes the lexer. */
  93 struct lexer *
  94 lex_create (struct source_stream *ss)
  95 {
  96   struct lexer *lexer = xzalloc (sizeof (*lexer));
  97
  98   ds_init_empty (&lexer->tokstr);
  99   ds_init_empty (&lexer->put_tokstr);
 100   ds_init_empty (&lexer->line_buffer);
 101   lexer->ss = ss;
 102
 103   return lexer;
 104 }
 105
 106 struct source_stream *
 107 lex_get_source_stream (const struct lexer *lex)
 108 {
 109   return lex->ss;
 110 }
 111
 112
 113 void
 114 lex_destroy (struct lexer *lexer)
 115 {
 116   if ( NULL != lexer )
 117     {
 118       ds_destroy (&lexer->put_tokstr);
 119       ds_destroy (&lexer->tokstr);
 120       ds_destroy (&lexer->line_buffer);
 121
 122       free (lexer);
 123     }
 124 }
 125
 126 \f
 127 /* Common functions. */
 128
 129 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 130    tokval, respectively, and sets tokid appropriately. */
 131 static void
 132 restore_token (struct lexer *lexer)
 133 {
 134   assert (lexer->put_token != 0);
 135   lexer->token = lexer->put_token;
 136   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 137   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 138   lexer->tokval = lexer->put_tokval;
 139   lexer->put_token = 0;
 140 }
 141
 142 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 143    put_lexer->tokval respectively. */
 144 static void
 145 save_token (struct lexer *lexer)
 146 {
 147   lexer->put_token = lexer->token;
 148   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 149   lexer->put_tokval = lexer->tokval;
 150 }
 151
 152 /* Parses a single token, setting appropriate global variables to
 153    indicate the token's attributes. */
 154 void
 155 lex_get (struct lexer *lexer)
 156 {
 157   /* Find a token. */
 158   for (;;)
 159     {
 160       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 161         {
 162           lexer->token = T_STOP;
 163           return;
 164         }
 165
 166   /* If a token was pushed ahead, return it. */
 167   if (lexer->put_token)
 168     {
 169       restore_token (lexer);
 170 #if DUMP_TOKENS
 171           dump_token (lexer);
 172 #endif
 173       return;
 174     }
 175
 176   for (;;)
 177     {
 178       /* Skip whitespace. */
 179           while (isspace ((unsigned char) *lexer->prog))
 180             lexer->prog++;
 181
 182           if (*lexer->prog)
 183             break;
 184
 185           if (lexer->dot)
 186             {
 187               lexer->dot = 0;
 188               lexer->token = '.';
 189 #if DUMP_TOKENS
 190               dump_token (lexer);
 191 #endif
 192               return;
 193             }
 194           else if (!lex_get_line (lexer))
 195             {
 196               lexer->prog = NULL;
 197               lexer->token = T_STOP;
 198 #if DUMP_TOKENS
 199               dump_token (lexer);
 200 #endif
 201               return;
 202             }
 203
 204           if (lexer->put_token)
 205             {
 206               restore_token (lexer);
 207 #if DUMP_TOKENS
 208               dump_token (lexer);
 209 #endif
 210               return;
 211             }
 212         }
 213
 214
 215       /* Actually parse the token. */
 216       ds_clear (&lexer->tokstr);
 217
 218       switch (*lexer->prog)
 219         {
 220         case '-': case '.':
 221         case '0': case '1': case '2': case '3': case '4':
 222         case '5': case '6': case '7': case '8': case '9':
 223           {
 224             char *tail;
 225
 226             /* `-' can introduce a negative number, or it can be a
 227                token by itself.  If it is not followed by a digit or a
 228                decimal point, it is definitely not a number.
 229                Otherwise, it might be either, but most of the time we
 230                want it as a number.  When the syntax calls for a `-'
 231                token, lex_negative_to_dash() must be used to break
 232                negative numbers into two tokens. */
 233             if (*lexer->prog == '-')
 234               {
 235                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 236                 while (isspace ((unsigned char) *lexer->prog))
 237                   lexer->prog++;
 238
 239                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 240                   {
 241                     lexer->token = '-';
 242                     break;
 243                   }
 244                 lexer->token = T_NEG_NUM;
 245               }
 246             else
 247               lexer->token = T_POS_NUM;
 248
 249             /* Parse the number, copying it into tokstr. */
 250             while (isdigit ((unsigned char) *lexer->prog))
 251               ds_put_char (&lexer->tokstr, *lexer->prog++);
 252             if (*lexer->prog == '.')
 253               {
 254                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 255                 while (isdigit ((unsigned char) *lexer->prog))
 256                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 257               }
 258             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 259               {
 260                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 261                 if (*lexer->prog == '+' || *lexer->prog == '-')
 262                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 263                 while (isdigit ((unsigned char) *lexer->prog))
 264                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 265               }
 266
 267             /* Parse as floating point. */
 268             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 269             if (*tail)
 270               {
 271                 msg (SE, _("%s does not form a valid number."),
 272                      ds_cstr (&lexer->tokstr));
 273                 lexer->tokval = 0.0;
 274
 275                 ds_clear (&lexer->tokstr);
 276                 ds_put_char (&lexer->tokstr, '0');
 277               }
 278
 279             break;
 280           }
 281
 282         case '\'': case '"':
 283           lexer->token = parse_string (lexer, CHARACTER_STRING);
 284           break;
 285
 286         case '(': case ')': case ',': case '=': case '+': case '/':
 287           lexer->token = *lexer->prog++;
 288           break;
 289
 290         case '*':
 291           if (*++lexer->prog == '*')
 292             {
 293               lexer->prog++;
 294               lexer->token = T_EXP;
 295             }
 296           else
 297             lexer->token = '*';
 298           break;
 299
 300         case '<':
 301           if (*++lexer->prog == '=')
 302             {
 303               lexer->prog++;
 304               lexer->token = T_LE;
 305             }
 306           else if (*lexer->prog == '>')
 307             {
 308               lexer->prog++;
 309               lexer->token = T_NE;
 310             }
 311           else
 312             lexer->token = T_LT;
 313           break;
 314
 315         case '>':
 316           if (*++lexer->prog == '=')
 317             {
 318               lexer->prog++;
 319               lexer->token = T_GE;
 320             }
 321           else
 322             lexer->token = T_GT;
 323           break;
 324
 325         case '~':
 326           if (*++lexer->prog == '=')
 327             {
 328               lexer->prog++;
 329               lexer->token = T_NE;
 330             }
 331           else
 332             lexer->token = T_NOT;
 333           break;
 334
 335         case '&':
 336           lexer->prog++;
 337           lexer->token = T_AND;
 338           break;
 339
 340         case '|':
 341           lexer->prog++;
 342           lexer->token = T_OR;
 343           break;
 344
 345         case 'b': case 'B':
 346           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 347             lexer->token = parse_string (lexer, BINARY_STRING);
 348           else
 349             lexer->token = parse_id (lexer);
 350           break;
 351
 352         case 'o': case 'O':
 353           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 354             lexer->token = parse_string (lexer, OCTAL_STRING);
 355           else
 356             lexer->token = parse_id (lexer);
 357           break;
 358
 359         case 'x': case 'X':
 360           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 361             lexer->token = parse_string (lexer, HEX_STRING);
 362           else
 363             lexer->token = parse_id (lexer);
 364           break;
 365
 366         default:
 367           if (lex_is_id1 (*lexer->prog))
 368             {
 369               lexer->token = parse_id (lexer);
 370               break;
 371             }
 372           else
 373             {
 374               if (isgraph ((unsigned char) *lexer->prog))
 375                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 376               else
 377                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 378               continue;
 379             }
 380         }
 381       break;
 382     }
 383
 384 #if DUMP_TOKENS
 385   dump_token (lexer);
 386 #endif
 387 }
 388
 389 /* Parses an identifier at the current position into tokid and
 390    tokstr.
 391    Returns the correct token type. */
 392 static int
 393 parse_id (struct lexer *lexer)
 394 {
 395   struct substring rest_of_line
 396     = ss_substr (ds_ss (&lexer->line_buffer),
 397                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 398                  SIZE_MAX);
 399   struct substring id = ss_head (rest_of_line,
 400                                  lex_id_get_length (rest_of_line));
 401   lexer->prog += ss_length (id);
 402
 403   ds_assign_substring (&lexer->tokstr, id);
 404   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 405   return lex_id_to_token (id);
 406 }
 407
 408 /* Reports an error to the effect that subcommand SBC may only be
 409    specified once. */
 410 void
 411 lex_sbc_only_once (const char *sbc)
 412 {
 413   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 414 }
 415
 416 /* Reports an error to the effect that subcommand SBC is
 417    missing. */
 418 void
 419 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 420 {
 421   lex_error (lexer, _("missing required subcommand %s"), sbc);
 422 }
 423
 424 /* Prints a syntax error message containing the current token and
 425    given message MESSAGE (if non-null). */
 426 void
 427 lex_error (struct lexer *lexer, const char *message, ...)
 428 {
 429   char *token_rep;
 430   char where[128];
 431
 432   token_rep = lex_token_representation (lexer);
 433   if (lexer->token == T_STOP)
 434     strcpy (where, "end of file");
 435   else if (lexer->token == '.')
 436     strcpy (where, "end of command");
 437   else
 438     snprintf (where, sizeof where, "`%s'", token_rep);
 439   free (token_rep);
 440
 441   if (message)
 442     {
 443       char buf[1024];
 444       va_list args;
 445
 446       va_start (args, message);
 447       vsnprintf (buf, 1024, message, args);
 448       va_end (args);
 449
 450       msg (SE, _("Syntax error %s at %s."), buf, where);
 451     }
 452   else
 453     msg (SE, _("Syntax error at %s."), where);
 454 }
 455
 456 /* Checks that we're at end of command.
 457    If so, returns a successful command completion code.
 458    If not, flags a syntax error and returns an error command
 459    completion code. */
 460 int
 461 lex_end_of_command (struct lexer *lexer)
 462 {
 463   if (lexer->token != '.')
 464     {
 465       lex_error (lexer, _("expecting end of command"));
 466       return CMD_FAILURE;
 467     }
 468   else
 469     return CMD_SUCCESS;
 470 }
 471 \f
 472 /* Token testing functions. */
 473
 474 /* Returns true if the current token is a number. */
 475 bool
 476 lex_is_number (struct lexer *lexer)
 477 {
 478   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 479 }
 480
 481
 482 /* Returns true if the current token is a string. */
 483 bool
 484 lex_is_string (struct lexer *lexer)
 485 {
 486   return lexer->token == T_STRING;
 487 }
 488
 489
 490 /* Returns the value of the current token, which must be a
 491    floating point number. */
 492 double
 493 lex_number (struct lexer *lexer)
 494 {
 495   assert (lex_is_number (lexer));
 496   return lexer->tokval;
 497 }
 498
 499 /* Returns true iff the current token is an integer. */
 500 bool
 501 lex_is_integer (struct lexer *lexer)
 502 {
 503   return (lex_is_number (lexer)
 504           && lexer->tokval != NOT_LONG
 505           && lexer->tokval >= LONG_MIN
 506           && lexer->tokval <= LONG_MAX
 507           && floor (lexer->tokval) == lexer->tokval);
 508 }
 509
 510 /* Returns the value of the current token, which must be an
 511    integer. */
 512 long
 513 lex_integer (struct lexer *lexer)
 514 {
 515   assert (lex_is_integer (lexer));
 516   return lexer->tokval;
 517 }
 518 \f
 519 /* Token matching functions. */
 520
 521 /* If TOK is the current token, skips it and returns true
 522    Otherwise, returns false. */
 523 bool
 524 lex_match (struct lexer *lexer, int t)
 525 {
 526   if (lexer->token == t)
 527     {
 528       lex_get (lexer);
 529       return true;
 530     }
 531   else
 532     return false;
 533 }
 534
 535 /* If the current token is the identifier S, skips it and returns
 536    true.  The identifier may be abbreviated to its first three
 537    letters.
 538    Otherwise, returns false. */
 539 bool
 540 lex_match_id (struct lexer *lexer, const char *s)
 541 {
 542   if (lexer->token == T_ID
 543       && lex_id_match (ss_cstr (s), ss_cstr (lexer->tokid)))
 544     {
 545       lex_get (lexer);
 546       return true;
 547     }
 548   else
 549     return false;
 550 }
 551
 552 /* If the current token is integer N, skips it and returns true.
 553    Otherwise, returns false. */
 554 bool
 555 lex_match_int (struct lexer *lexer, int x)
 556 {
 557   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 558     {
 559       lex_get (lexer);
 560       return true;
 561     }
 562   else
 563     return false;
 564 }
 565 \f
 566 /* Forced matches. */
 567
 568 /* If this token is identifier S, fetches the next token and returns
 569    nonzero.
 570    Otherwise, reports an error and returns zero. */
 571 bool
 572 lex_force_match_id (struct lexer *lexer, const char *s)
 573 {
 574   if (lex_match_id (lexer, s))
 575     return true;
 576   else
 577     {
 578       lex_error (lexer, _("expecting `%s'"), s);
 579       return false;
 580     }
 581 }
 582
 583 /* If the current token is T, skips the token.  Otherwise, reports an
 584    error and returns from the current function with return value false. */
 585 bool
 586 lex_force_match (struct lexer *lexer, int t)
 587 {
 588   if (lexer->token == t)
 589     {
 590       lex_get (lexer);
 591       return true;
 592     }
 593   else
 594     {
 595       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 596       return false;
 597     }
 598 }
 599
 600 /* If this token is a string, does nothing and returns true.
 601    Otherwise, reports an error and returns false. */
 602 bool
 603 lex_force_string (struct lexer *lexer)
 604 {
 605   if (lexer->token == T_STRING)
 606     return true;
 607   else
 608     {
 609       lex_error (lexer, _("expecting string"));
 610       return false;
 611     }
 612 }
 613
 614 /* If this token is an integer, does nothing and returns true.
 615    Otherwise, reports an error and returns false. */
 616 bool
 617 lex_force_int (struct lexer *lexer)
 618 {
 619   if (lex_is_integer (lexer))
 620     return true;
 621   else
 622     {
 623       lex_error (lexer, _("expecting integer"));
 624       return false;
 625     }
 626 }
 627
 628 /* If this token is a number, does nothing and returns true.
 629    Otherwise, reports an error and returns false. */
 630 bool
 631 lex_force_num (struct lexer *lexer)
 632 {
 633   if (lex_is_number (lexer))
 634     return true;
 635
 636   lex_error (lexer, _("expecting number"));
 637   return false;
 638 }
 639
 640 /* If this token is an identifier, does nothing and returns true.
 641    Otherwise, reports an error and returns false. */
 642 bool
 643 lex_force_id (struct lexer *lexer)
 644 {
 645   if (lexer->token == T_ID)
 646     return true;
 647
 648   lex_error (lexer, _("expecting identifier"));
 649   return false;
 650 }
 651
 652 /* Weird token functions. */
 653
 654 /* Returns the first character of the next token, except that if the
 655    next token is not an identifier, the character returned will not be
 656    a character that can begin an identifier.  Specifically, the
 657    hexstring lead-in X' causes lookahead() to return '.  Note that an
 658    alphanumeric return value doesn't guarantee an ID token, it could
 659    also be a reserved-word token. */
 660 int
 661 lex_look_ahead (struct lexer *lexer)
 662 {
 663   if (lexer->put_token)
 664     return lexer->put_token;
 665
 666   for (;;)
 667     {
 668       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 669         return 0;
 670
 671       for (;;)
 672         {
 673           while (isspace ((unsigned char) *lexer->prog))
 674             lexer->prog++;
 675           if (*lexer->prog)
 676             break;
 677
 678           if (lexer->dot)
 679             return '.';
 680           else if (!lex_get_line (lexer))
 681             return 0;
 682
 683           if (lexer->put_token)
 684             return lexer->put_token;
 685         }
 686
 687       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 688            || toupper ((unsigned char) *lexer->prog) == 'B'
 689            || toupper ((unsigned char) *lexer->prog) == 'O')
 690           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 691         return '\'';
 692
 693       return *lexer->prog;
 694     }
 695 }
 696
 697 /* Makes the current token become the next token to be read; the
 698    current token is set to T. */
 699 void
 700 lex_put_back (struct lexer *lexer, int t)
 701 {
 702   save_token (lexer);
 703   lexer->token = t;
 704 }
 705
 706 /* Makes the current token become the next token to be read; the
 707    current token is set to the identifier ID. */
 708 void
 709 lex_put_back_id (struct lexer *lexer, const char *id)
 710 {
 711   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 712   save_token (lexer);
 713   lexer->token = T_ID;
 714   ds_assign_cstr (&lexer->tokstr, id);
 715   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 716 }
 717 \f
 718 /* Weird line processing functions. */
 719
 720 /* Returns the entire contents of the current line. */
 721 const char *
 722 lex_entire_line (const struct lexer *lexer)
 723 {
 724   return ds_cstr (&lexer->line_buffer);
 725 }
 726
 727 const struct string *
 728 lex_entire_line_ds (const struct lexer *lexer)
 729 {
 730   return &lexer->line_buffer;
 731 }
 732
 733 /* As lex_entire_line(), but only returns the part of the current line
 734    that hasn't already been tokenized. */
 735 const char *
 736 lex_rest_of_line (const struct lexer *lexer)
 737 {
 738   return lexer->prog;
 739 }
 740
 741 /* Returns true if the current line ends in a terminal dot,
 742    false otherwise. */
 743 bool
 744 lex_end_dot (const struct lexer *lexer)
 745 {
 746   return lexer->dot;
 747 }
 748
 749 /* Causes the rest of the current input line to be ignored for
 750    tokenization purposes. */
 751 void
 752 lex_discard_line (struct lexer *lexer)
 753 {
 754   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 755   lexer->prog = ds_end (&lexer->line_buffer);
 756   lexer->dot = false;
 757   lexer->put_token = 0;
 758 }
 759
 760
 761 /* Discards the rest of the current command.
 762    When we're reading commands from a file, we skip tokens until
 763    a terminal dot or EOF.
 764    When we're reading commands interactively from the user,
 765    that's just discarding the current line, because presumably
 766    the user doesn't want to finish typing a command that will be
 767    ignored anyway. */
 768 void
 769 lex_discard_rest_of_command (struct lexer *lexer)
 770 {
 771   if (!getl_is_interactive (lexer->ss))
 772     {
 773       while (lexer->token != T_STOP && lexer->token != '.')
 774         lex_get (lexer);
 775     }
 776   else
 777     lex_discard_line (lexer);
 778 }
 779 \f
 780 /* Weird line reading functions. */
 781
 782 /* Remove C-style comments in STRING, begun by slash-star and
 783    terminated by star-slash or newline. */
 784 static void
 785 strip_comments (struct string *string)
 786 {
 787   char *cp;
 788   int quote;
 789   bool in_comment;
 790
 791   in_comment = false;
 792   quote = EOF;
 793   for (cp = ds_cstr (string); *cp; )
 794     {
 795       /* If we're not in a comment, check for quote marks. */
 796       if (!in_comment)
 797         {
 798           if (*cp == quote)
 799             quote = EOF;
 800           else if (*cp == '\'' || *cp == '"')
 801             quote = *cp;
 802         }
 803
 804       /* If we're not inside a quotation, check for comment. */
 805       if (quote == EOF)
 806         {
 807           if (cp[0] == '/' && cp[1] == '*')
 808             {
 809               in_comment = true;
 810               *cp++ = ' ';
 811               *cp++ = ' ';
 812               continue;
 813             }
 814           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 815             {
 816               in_comment = false;
 817               *cp++ = ' ';
 818               *cp++ = ' ';
 819               continue;
 820             }
 821         }
 822
 823       /* Check commenting. */
 824       if (in_comment)
 825         *cp = ' ';
 826       cp++;
 827     }
 828 }
 829
 830 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 831    tokenization by stripping comments and determining whether it
 832    is the beginning or end of a command and storing into
 833    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 834 void
 835 lex_preprocess_line (struct string *line,
 836                      enum getl_syntax syntax,
 837                      bool *line_starts_command,
 838                      bool *line_ends_command)
 839 {
 840   strip_comments (line);
 841   ds_rtrim (line, ss_cstr (CC_SPACES));
 842   *line_ends_command = (ds_chomp (line, get_endcmd ())
 843                         || (ds_is_empty (line) && get_nulline ()));
 844   *line_starts_command = false;
 845   if (syntax == GETL_BATCH)
 846     {
 847       int first = ds_first (line);
 848       *line_starts_command = !isspace (first);
 849       if (first == '+' || first == '-')
 850         *ds_data (line) = ' ';
 851     }
 852 }
 853
 854 /* Reads a line, without performing any preprocessing.
 855    Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax
 856    mode. */
 857 bool
 858 lex_get_line_raw (struct lexer *lexer, enum getl_syntax *syntax)
 859 {
 860   enum getl_syntax dummy;
 861   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer,
 862                               syntax != NULL ? syntax : &dummy);
 863   return ok;
 864 }
 865
 866 /* Reads a line for use by the tokenizer, and preprocesses it by
 867    removing comments, stripping trailing whitespace and the
 868    terminal dot, and removing leading indentors. */
 869 bool
 870 lex_get_line (struct lexer *lexer)
 871 {
 872   bool line_starts_command;
 873   enum getl_syntax syntax;
 874
 875   if (!lex_get_line_raw (lexer, &syntax))
 876     {
 877       lexer->prog = NULL;
 878     return false;
 879     }
 880
 881   lex_preprocess_line (&lexer->line_buffer, syntax,
 882                        &line_starts_command, &lexer->dot);
 883
 884   if (line_starts_command)
 885     lexer->put_token = '.';
 886
 887   lexer->prog = ds_cstr (&lexer->line_buffer);
 888   return true;
 889 }
 890 \f
 891 /* Token names. */
 892
 893 /* Returns the name of a token. */
 894 const char *
 895 lex_token_name (int token)
 896 {
 897   if (lex_is_keyword (token))
 898     return lex_id_name (token);
 899   else if (token < 256)
 900     {
 901       static char t[256][2];
 902       char *s = t[token];
 903       s[0] = token;
 904       s[1] = '\0';
 905       return s;
 906     }
 907   else
 908     NOT_REACHED ();
 909 }
 910
 911 /* Returns an ASCII representation of the current token as a
 912    malloc()'d string. */
 913 char *
 914 lex_token_representation (struct lexer *lexer)
 915 {
 916   char *token_rep;
 917
 918   switch (lexer->token)
 919     {
 920     case T_ID:
 921     case T_POS_NUM:
 922     case T_NEG_NUM:
 923       return ds_xstrdup (&lexer->tokstr);
 924       break;
 925
 926     case T_STRING:
 927       {
 928         int hexstring = 0;
 929         char *sp, *dp;
 930
 931         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 932           if (!isprint ((unsigned char) *sp))
 933             {
 934               hexstring = 1;
 935               break;
 936             }
 937
 938         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 939
 940         dp = token_rep;
 941         if (hexstring)
 942           *dp++ = 'X';
 943         *dp++ = '\'';
 944
 945         if (!hexstring)
 946           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 947             {
 948               if (*sp == '\'')
 949                 *dp++ = '\'';
 950               *dp++ = (unsigned char) *sp++;
 951             }
 952         else
 953           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 954             {
 955               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 956               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 957             }
 958         *dp++ = '\'';
 959         *dp = '\0';
 960
 961         return token_rep;
 962       }
 963     break;
 964
 965     case T_STOP:
 966       token_rep = xmalloc (1);
 967       *token_rep = '\0';
 968       return token_rep;
 969
 970     case T_EXP:
 971       return xstrdup ("**");
 972
 973     default:
 974       return xstrdup (lex_token_name (lexer->token));
 975     }
 976
 977   NOT_REACHED ();
 978 }
 979 \f
 980 /* Really weird functions. */
 981
 982 /* Most of the time, a `-' is a lead-in to a negative number.  But
 983    sometimes it's actually part of the syntax.  If a dash can be part
 984    of syntax then this function is called to rip it off of a
 985    number. */
 986 void
 987 lex_negative_to_dash (struct lexer *lexer)
 988 {
 989   if (lexer->token == T_NEG_NUM)
 990     {
 991       lexer->token = T_POS_NUM;
 992       lexer->tokval = -lexer->tokval;
 993       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 994       save_token (lexer);
 995       lexer->token = '-';
 996     }
 997 }
 998
 999 /* Skip a COMMENT command. */
1000 void
1001 lex_skip_comment (struct lexer *lexer)
1002 {
1003   for (;;)
1004     {
1005       if (!lex_get_line (lexer))
1006         {
1007           lexer->put_token = T_STOP;
1008           lexer->prog = NULL;
1009           return;
1010         }
1011
1012       if (lexer->put_token == '.')
1013         break;
1014
1015       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1016       lexer->prog = ds_end (&lexer->line_buffer);
1017       if (lexer->dot)
1018         break;
1019     }
1020 }
1021 \f
1022 /* Private functions. */
1023
1024 /* When invoked, tokstr contains a string of binary, octal, or
1025    hex digits, according to TYPE.  The string is converted to
1026    characters having the specified values. */
1027 static void
1028 convert_numeric_string_to_char_string (struct lexer *lexer,
1029                                        enum string_type type)
1030 {
1031   const char *base_name;
1032   int base;
1033   int chars_per_byte;
1034   size_t byte_cnt;
1035   size_t i;
1036   char *p;
1037
1038   switch (type)
1039     {
1040     case BINARY_STRING:
1041       base_name = _("binary");
1042       base = 2;
1043       chars_per_byte = 8;
1044       break;
1045     case OCTAL_STRING:
1046       base_name = _("octal");
1047       base = 8;
1048       chars_per_byte = 3;
1049       break;
1050     case HEX_STRING:
1051       base_name = _("hex");
1052       base = 16;
1053       chars_per_byte = 2;
1054       break;
1055     default:
1056       NOT_REACHED ();
1057     }
1058
1059   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1060   if (ds_length (&lexer->tokstr) % chars_per_byte)
1061     msg (SE, _("String of %s digits has %d characters, which is not a "
1062                "multiple of %d."),
1063          base_name, (int) ds_length (&lexer->tokstr), chars_per_byte);
1064
1065   p = ds_cstr (&lexer->tokstr);
1066   for (i = 0; i < byte_cnt; i++)
1067     {
1068       int value;
1069       int j;
1070
1071       value = 0;
1072       for (j = 0; j < chars_per_byte; j++, p++)
1073         {
1074           int v;
1075
1076           if (*p >= '0' && *p <= '9')
1077             v = *p - '0';
1078           else
1079             {
1080               static const char alpha[] = "abcdef";
1081               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1082
1083               if (q)
1084                 v = q - alpha + 10;
1085               else
1086                 v = base;
1087             }
1088
1089           if (v >= base)
1090             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1091
1092           value = value * base + v;
1093         }
1094
1095       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1096     }
1097
1098   ds_truncate (&lexer->tokstr, byte_cnt);
1099 }
1100
1101 /* Parses a string from the input buffer into tokstr.  The input
1102    buffer pointer lexer->prog must point to the initial single or double
1103    quote.  TYPE indicates the type of string to be parsed.
1104    Returns token type. */
1105 static int
1106 parse_string (struct lexer *lexer, enum string_type type)
1107 {
1108   if (type != CHARACTER_STRING)
1109     lexer->prog++;
1110
1111   /* Accumulate the entire string, joining sections indicated by +
1112      signs. */
1113   for (;;)
1114     {
1115       /* Single or double quote. */
1116       int c = *lexer->prog++;
1117
1118       /* Accumulate section. */
1119       for (;;)
1120         {
1121           /* Check end of line. */
1122           if (*lexer->prog == '\0')
1123             {
1124               msg (SE, _("Unterminated string constant."));
1125               goto finish;
1126             }
1127
1128           /* Double quote characters to embed them in strings. */
1129           if (*lexer->prog == c)
1130             {
1131               if (lexer->prog[1] == c)
1132                 lexer->prog++;
1133               else
1134                 break;
1135             }
1136
1137           ds_put_char (&lexer->tokstr, *lexer->prog++);
1138         }
1139       lexer->prog++;
1140
1141       /* Skip whitespace after final quote mark. */
1142       if (lexer->prog == NULL)
1143         break;
1144       for (;;)
1145         {
1146           while (isspace ((unsigned char) *lexer->prog))
1147             lexer->prog++;
1148           if (*lexer->prog)
1149             break;
1150
1151           if (lexer->dot)
1152             goto finish;
1153
1154           if (!lex_get_line (lexer))
1155             goto finish;
1156         }
1157
1158       /* Skip plus sign. */
1159       if (*lexer->prog != '+')
1160         break;
1161       lexer->prog++;
1162
1163       /* Skip whitespace after plus sign. */
1164       if (lexer->prog == NULL)
1165         break;
1166       for (;;)
1167         {
1168           while (isspace ((unsigned char) *lexer->prog))
1169             lexer->prog++;
1170           if (*lexer->prog)
1171             break;
1172
1173           if (lexer->dot)
1174             goto finish;
1175
1176           if (!lex_get_line (lexer))
1177             {
1178               msg (SE, _("Unexpected end of file in string concatenation."));
1179               goto finish;
1180             }
1181         }
1182
1183       /* Ensure that a valid string follows. */
1184       if (*lexer->prog != '\'' && *lexer->prog != '"')
1185         {
1186           msg (SE, _("String expected following `+'."));
1187           goto finish;
1188         }
1189     }
1190
1191   /* We come here when we've finished concatenating all the string sections
1192      into one large string. */
1193 finish:
1194   if (type != CHARACTER_STRING)
1195     convert_numeric_string_to_char_string (lexer, type);
1196
1197   if (ds_length (&lexer->tokstr) > 255)
1198     {
1199       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1200            (int) ds_length (&lexer->tokstr));
1201       ds_truncate (&lexer->tokstr, 255);
1202     }
1203
1204   return T_STRING;
1205 }
1206 \f
1207 #if DUMP_TOKENS
1208 /* Reads one token from the lexer and writes a textual representation
1209    on stdout for debugging purposes. */
1210 static void
1211 dump_token (struct lexer *lexer)
1212 {
1213   {
1214     const char *curfn;
1215     int curln;
1216
1217     curln = getl_source_location (lexer->ss);
1218     curfn = getl_source_name (lexer->ss);
1219     if (curfn)
1220       fprintf (stderr, "%s:%d\t", curfn, curln);
1221   }
1222
1223   switch (lexer->token)
1224     {
1225     case T_ID:
1226       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1227       break;
1228
1229     case T_POS_NUM:
1230     case T_NEG_NUM:
1231       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1232       break;
1233
1234     case T_STRING:
1235       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1236       break;
1237
1238     case T_STOP:
1239       fprintf (stderr, "STOP\n");
1240       break;
1241
1242     case T_EXP:
1243       fprintf (stderr, "MISC\tEXP\"");
1244       break;
1245
1246     case 0:
1247       fprintf (stderr, "MISC\tEOF\n");
1248       break;
1249
1250     default:
1251       if (lex_is_keyword (lexer->token))
1252         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (lexer->token));
1253       else
1254         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1255       break;
1256     }
1257 }
1258 #endif /* DUMP_TOKENS */
1259
1260
1261 /* Token Accessor Functions */
1262
1263 int
1264 lex_token (const struct lexer *lexer)
1265 {
1266   return lexer->token;
1267 }
1268
1269 double
1270 lex_tokval (const struct lexer *lexer)
1271 {
1272   return lexer->tokval;
1273 }
1274
1275 const char *
1276 lex_tokid (const struct lexer *lexer)
1277 {
1278   return lexer->tokid;
1279 }
1280
1281 const struct string *
1282 lex_tokstr (const struct lexer *lexer)
1283 {
1284   return &lexer->tokstr;
1285 }