src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <libpspp/magic.h>
  34 #include <data/settings.h>
  35 #include <libpspp/getl.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44 /*
  45 #define DUMP_TOKENS 1
  46 */
  47
  48 struct lexer
  49 {
  50   struct string line_buffer;
  51
  52   struct source_stream *ss;
  53
  54   int token;      /* Current token. */
  55   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  56
  57   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  58
  59   struct string tokstr;   /* T_ID, T_STRING: token string value.
  60                             For T_ID, this is not truncated as is
  61                             tokid. */
  62
  63   char *prog; /* Pointer to next token in line_buffer. */
  64   bool dot;   /* True only if this line ends with a terminal dot. */
  65   bool eof;   /* True only if the last token returned was T_STOP. */
  66
  67   int put_token ; /* If nonzero, next token returned by lex_get().
  68                     Used only in exceptional circumstances. */
  69
  70   struct string put_tokstr;
  71   double put_tokval;
  72 };
  73
  74
  75 static int parse_id (struct lexer *);
  76
  77 /* How a string represents its contents. */
  78 enum string_type
  79   {
  80     CHARACTER_STRING,   /* Characters. */
  81     BINARY_STRING,      /* Binary digits. */
  82     OCTAL_STRING,       /* Octal digits. */
  83     HEX_STRING          /* Hexadecimal digits. */
  84   };
  85
  86 static int parse_string (struct lexer *, enum string_type);
  87
  88 #if DUMP_TOKENS
  89 static void dump_token (void);
  90 #endif
  91 \f
  92 /* Initialization. */
  93
  94 /* Initializes the lexer. */
  95 struct lexer *
  96 lex_create (struct source_stream *ss)
  97 {
  98   struct lexer *lexer = xzalloc (sizeof (*lexer));
  99
 100   ds_init_empty (&lexer->tokstr);
 101   ds_init_empty (&lexer->put_tokstr);
 102   ds_init_empty (&lexer->line_buffer);
 103   lexer->ss = ss;
 104
 105   if (!lex_get_line (lexer))
 106     lexer->eof = true;
 107
 108   return lexer;
 109 }
 110
 111 struct source_stream *
 112 lex_get_source_stream (const struct lexer *lex)
 113 {
 114   return lex->ss;
 115 }
 116
 117
 118 void
 119 lex_destroy (struct lexer *lexer)
 120 {
 121   if ( NULL != lexer )
 122     {
 123       ds_destroy (&lexer->put_tokstr);
 124       ds_destroy (&lexer->tokstr);
 125       ds_destroy (&lexer->line_buffer);
 126
 127       free (lexer);
 128     }
 129 }
 130
 131 \f
 132 /* Common functions. */
 133
 134 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 135    tokval, respectively, and sets tokid appropriately. */
 136 static void
 137 restore_token (struct lexer *lexer)
 138 {
 139   assert (lexer->put_token != 0);
 140   lexer->token = lexer->put_token;
 141   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 142   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 143   lexer->tokval = lexer->put_tokval;
 144   lexer->put_token = 0;
 145 }
 146
 147 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 148    put_lexer->tokval respectively. */
 149 static void
 150 save_token (struct lexer *lexer)
 151 {
 152   lexer->put_token = lexer->token;
 153   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 154   lexer->put_tokval = lexer->tokval;
 155 }
 156
 157 /* Parses a single token, setting appropriate global variables to
 158    indicate the token's attributes. */
 159 void
 160 lex_get (struct lexer *lexer)
 161 {
 162   /* If a token was pushed ahead, return it. */
 163   if (lexer->put_token)
 164     {
 165       restore_token (lexer);
 166 #if DUMP_TOKENS
 167       dump_token ();
 168 #endif
 169       return;
 170     }
 171
 172   /* Find a token. */
 173   for (;;)
 174     {
 175       /* Skip whitespace. */
 176       if (lexer->eof)
 177         {
 178           lexer->token = T_STOP;
 179           return;
 180         }
 181
 182       for (;;)
 183         {
 184           while (isspace ((unsigned char) *lexer->prog))
 185             lexer->prog++;
 186           if (*lexer->prog)
 187             break;
 188
 189           if (lexer->dot)
 190             {
 191               lexer->dot = 0;
 192               lexer->token = '.';
 193 #if DUMP_TOKENS
 194               dump_token ();
 195 #endif
 196               return;
 197             }
 198           else if (!lex_get_line (lexer))
 199             {
 200               lexer->eof = true;
 201               lexer->token = T_STOP;
 202 #if DUMP_TOKENS
 203               dump_token ();
 204 #endif
 205               return;
 206             }
 207
 208           if (lexer->put_token)
 209             {
 210               restore_token (lexer);
 211 #if DUMP_TOKENS
 212               dump_token ();
 213 #endif
 214               return;
 215             }
 216         }
 217
 218
 219       /* Actually parse the token. */
 220       ds_clear (&lexer->tokstr);
 221
 222       switch (*lexer->prog)
 223         {
 224         case '-': case '.':
 225         case '0': case '1': case '2': case '3': case '4':
 226         case '5': case '6': case '7': case '8': case '9':
 227           {
 228             char *tail;
 229
 230             /* `-' can introduce a negative number, or it can be a
 231                token by itself.  If it is not followed by a digit or a
 232                decimal point, it is definitely not a number.
 233                Otherwise, it might be either, but most of the time we
 234                want it as a number.  When the syntax calls for a `-'
 235                token, lex_negative_to_dash() must be used to break
 236                negative numbers into two tokens. */
 237             if (*lexer->prog == '-')
 238               {
 239                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 240                 while (isspace ((unsigned char) *lexer->prog))
 241                   lexer->prog++;
 242
 243                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 244                   {
 245                     lexer->token = '-';
 246                     break;
 247                   }
 248                 lexer->token = T_NEG_NUM;
 249               }
 250             else
 251               lexer->token = T_POS_NUM;
 252
 253             /* Parse the number, copying it into tokstr. */
 254             while (isdigit ((unsigned char) *lexer->prog))
 255               ds_put_char (&lexer->tokstr, *lexer->prog++);
 256             if (*lexer->prog == '.')
 257               {
 258                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 259                 while (isdigit ((unsigned char) *lexer->prog))
 260                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 261               }
 262             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 263               {
 264                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 265                 if (*lexer->prog == '+' || *lexer->prog == '-')
 266                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 267                 while (isdigit ((unsigned char) *lexer->prog))
 268                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 269               }
 270
 271             /* Parse as floating point. */
 272             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 273             if (*tail)
 274               {
 275                 msg (SE, _("%s does not form a valid number."),
 276                      ds_cstr (&lexer->tokstr));
 277                 lexer->tokval = 0.0;
 278
 279                 ds_clear (&lexer->tokstr);
 280                 ds_put_char (&lexer->tokstr, '0');
 281               }
 282
 283             break;
 284           }
 285
 286         case '\'': case '"':
 287           lexer->token = parse_string (lexer, CHARACTER_STRING);
 288           break;
 289
 290         case '(': case ')': case ',': case '=': case '+': case '/':
 291           lexer->token = *lexer->prog++;
 292           break;
 293
 294         case '*':
 295           if (*++lexer->prog == '*')
 296             {
 297               lexer->prog++;
 298               lexer->token = T_EXP;
 299             }
 300           else
 301             lexer->token = '*';
 302           break;
 303
 304         case '<':
 305           if (*++lexer->prog == '=')
 306             {
 307               lexer->prog++;
 308               lexer->token = T_LE;
 309             }
 310           else if (*lexer->prog == '>')
 311             {
 312               lexer->prog++;
 313               lexer->token = T_NE;
 314             }
 315           else
 316             lexer->token = T_LT;
 317           break;
 318
 319         case '>':
 320           if (*++lexer->prog == '=')
 321             {
 322               lexer->prog++;
 323               lexer->token = T_GE;
 324             }
 325           else
 326             lexer->token = T_GT;
 327           break;
 328
 329         case '~':
 330           if (*++lexer->prog == '=')
 331             {
 332               lexer->prog++;
 333               lexer->token = T_NE;
 334             }
 335           else
 336             lexer->token = T_NOT;
 337           break;
 338
 339         case '&':
 340           lexer->prog++;
 341           lexer->token = T_AND;
 342           break;
 343
 344         case '|':
 345           lexer->prog++;
 346           lexer->token = T_OR;
 347           break;
 348
 349         case 'b': case 'B':
 350           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 351             lexer->token = parse_string (lexer, BINARY_STRING);
 352           else
 353             lexer->token = parse_id (lexer);
 354           break;
 355
 356         case 'o': case 'O':
 357           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 358             lexer->token = parse_string (lexer, OCTAL_STRING);
 359           else
 360             lexer->token = parse_id (lexer);
 361           break;
 362
 363         case 'x': case 'X':
 364           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 365             lexer->token = parse_string (lexer, HEX_STRING);
 366           else
 367             lexer->token = parse_id (lexer);
 368           break;
 369
 370         default:
 371           if (lex_is_id1 (*lexer->prog))
 372             {
 373               lexer->token = parse_id (lexer);
 374               break;
 375             }
 376           else
 377             {
 378               if (isgraph ((unsigned char) *lexer->prog))
 379                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 380               else
 381                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 382               continue;
 383             }
 384         }
 385       break;
 386     }
 387
 388 #if DUMP_TOKENS
 389   dump_token ();
 390 #endif
 391 }
 392
 393 /* Parses an identifier at the current position into tokid and
 394    tokstr.
 395    Returns the correct token type. */
 396 static int
 397 parse_id (struct lexer *lexer)
 398 {
 399   struct substring rest_of_line
 400     = ss_substr (ds_ss (&lexer->line_buffer),
 401                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 402                  SIZE_MAX);
 403   struct substring id = ss_head (rest_of_line,
 404                                  lex_id_get_length (rest_of_line));
 405   lexer->prog += ss_length (id);
 406
 407   ds_assign_substring (&lexer->tokstr, id);
 408   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 409   return lex_id_to_token (id);
 410 }
 411
 412 /* Reports an error to the effect that subcommand SBC may only be
 413    specified once. */
 414 void
 415 lex_sbc_only_once (const char *sbc)
 416 {
 417   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 418 }
 419
 420 /* Reports an error to the effect that subcommand SBC is
 421    missing. */
 422 void
 423 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 424 {
 425   lex_error (lexer, _("missing required subcommand %s"), sbc);
 426 }
 427
 428 /* Prints a syntax error message containing the current token and
 429    given message MESSAGE (if non-null). */
 430 void
 431 lex_error (struct lexer *lexer, const char *message, ...)
 432 {
 433   char *token_rep;
 434   char where[128];
 435
 436   token_rep = lex_token_representation (lexer);
 437   if (lexer->token == T_STOP)
 438     strcpy (where, "end of file");
 439   else if (lexer->token == '.')
 440     strcpy (where, "end of command");
 441   else
 442     snprintf (where, sizeof where, "`%s'", token_rep);
 443   free (token_rep);
 444
 445   if (message)
 446     {
 447       char buf[1024];
 448       va_list args;
 449
 450       va_start (args, message);
 451       vsnprintf (buf, 1024, message, args);
 452       va_end (args);
 453
 454       msg (SE, _("Syntax error %s at %s."), buf, where);
 455     }
 456   else
 457     msg (SE, _("Syntax error at %s."), where);
 458 }
 459
 460 /* Checks that we're at end of command.
 461    If so, returns a successful command completion code.
 462    If not, flags a syntax error and returns an error command
 463    completion code. */
 464 int
 465 lex_end_of_command (struct lexer *lexer)
 466 {
 467   if (lexer->token != '.')
 468     {
 469       lex_error (lexer, _("expecting end of command"));
 470       return CMD_FAILURE;
 471     }
 472   else
 473     return CMD_SUCCESS;
 474 }
 475 \f
 476 /* Token testing functions. */
 477
 478 /* Returns true if the current token is a number. */
 479 bool
 480 lex_is_number (struct lexer *lexer)
 481 {
 482   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 483 }
 484
 485 /* Returns the value of the current token, which must be a
 486    floating point number. */
 487 double
 488 lex_number (struct lexer *lexer)
 489 {
 490   assert (lex_is_number (lexer));
 491   return lexer->tokval;
 492 }
 493
 494 /* Returns true iff the current token is an integer. */
 495 bool
 496 lex_is_integer (struct lexer *lexer)
 497 {
 498   return (lex_is_number (lexer)
 499           && lexer->tokval != NOT_LONG
 500           && lexer->tokval >= LONG_MIN
 501           && lexer->tokval <= LONG_MAX
 502           && floor (lexer->tokval) == lexer->tokval);
 503 }
 504
 505 /* Returns the value of the current token, which must be an
 506    integer. */
 507 long
 508 lex_integer (struct lexer *lexer)
 509 {
 510   assert (lex_is_integer (lexer));
 511   return lexer->tokval;
 512 }
 513 \f
 514 /* Token matching functions. */
 515
 516 /* If TOK is the current token, skips it and returns true
 517    Otherwise, returns false. */
 518 bool
 519 lex_match (struct lexer *lexer, int t)
 520 {
 521   if (lexer->token == t)
 522     {
 523       lex_get (lexer);
 524       return true;
 525     }
 526   else
 527     return false;
 528 }
 529
 530 /* If the current token is the identifier S, skips it and returns
 531    true.  The identifier may be abbreviated to its first three
 532    letters.
 533    Otherwise, returns false. */
 534 bool
 535 lex_match_id (struct lexer *lexer, const char *s)
 536 {
 537   if (lexer->token == T_ID
 538       && lex_id_match (ss_cstr (s), ss_cstr (lexer->tokid)))
 539     {
 540       lex_get (lexer);
 541       return true;
 542     }
 543   else
 544     return false;
 545 }
 546
 547 /* If the current token is integer N, skips it and returns true.
 548    Otherwise, returns false. */
 549 bool
 550 lex_match_int (struct lexer *lexer, int x)
 551 {
 552   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 553     {
 554       lex_get (lexer);
 555       return true;
 556     }
 557   else
 558     return false;
 559 }
 560 \f
 561 /* Forced matches. */
 562
 563 /* If this token is identifier S, fetches the next token and returns
 564    nonzero.
 565    Otherwise, reports an error and returns zero. */
 566 bool
 567 lex_force_match_id (struct lexer *lexer, const char *s)
 568 {
 569   if (lex_match_id (lexer, s))
 570     return true;
 571   else
 572     {
 573       lex_error (lexer, _("expecting `%s'"), s);
 574       return false;
 575     }
 576 }
 577
 578 /* If the current token is T, skips the token.  Otherwise, reports an
 579    error and returns from the current function with return value false. */
 580 bool
 581 lex_force_match (struct lexer *lexer, int t)
 582 {
 583   if (lexer->token == t)
 584     {
 585       lex_get (lexer);
 586       return true;
 587     }
 588   else
 589     {
 590       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 591       return false;
 592     }
 593 }
 594
 595 /* If this token is a string, does nothing and returns true.
 596    Otherwise, reports an error and returns false. */
 597 bool
 598 lex_force_string (struct lexer *lexer)
 599 {
 600   if (lexer->token == T_STRING)
 601     return true;
 602   else
 603     {
 604       lex_error (lexer, _("expecting string"));
 605       return false;
 606     }
 607 }
 608
 609 /* If this token is an integer, does nothing and returns true.
 610    Otherwise, reports an error and returns false. */
 611 bool
 612 lex_force_int (struct lexer *lexer)
 613 {
 614   if (lex_is_integer (lexer))
 615     return true;
 616   else
 617     {
 618       lex_error (lexer, _("expecting integer"));
 619       return false;
 620     }
 621 }
 622
 623 /* If this token is a number, does nothing and returns true.
 624    Otherwise, reports an error and returns false. */
 625 bool
 626 lex_force_num (struct lexer *lexer)
 627 {
 628   if (lex_is_number (lexer))
 629     return true;
 630
 631   lex_error (lexer, _("expecting number"));
 632   return false;
 633 }
 634
 635 /* If this token is an identifier, does nothing and returns true.
 636    Otherwise, reports an error and returns false. */
 637 bool
 638 lex_force_id (struct lexer *lexer)
 639 {
 640   if (lexer->token == T_ID)
 641     return true;
 642
 643   lex_error (lexer, _("expecting identifier"));
 644   return false;
 645 }
 646
 647 /* Weird token functions. */
 648
 649 /* Returns the first character of the next token, except that if the
 650    next token is not an identifier, the character returned will not be
 651    a character that can begin an identifier.  Specifically, the
 652    hexstring lead-in X' causes lookahead() to return '.  Note that an
 653    alphanumeric return value doesn't guarantee an ID token, it could
 654    also be a reserved-word token. */
 655 int
 656 lex_look_ahead (struct lexer *lexer)
 657 {
 658   if (lexer->put_token)
 659     return lexer->put_token;
 660
 661   for (;;)
 662     {
 663       if (lexer->eof)
 664         return 0;
 665
 666       for (;;)
 667         {
 668           while (isspace ((unsigned char) *lexer->prog))
 669             lexer->prog++;
 670           if (*lexer->prog)
 671             break;
 672
 673           if (lexer->dot)
 674             return '.';
 675           else if (!lex_get_line (lexer))
 676             return 0;
 677
 678           if (lexer->put_token)
 679             return lexer->put_token;
 680         }
 681
 682       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 683            || toupper ((unsigned char) *lexer->prog) == 'B'
 684            || toupper ((unsigned char) *lexer->prog) == 'O')
 685           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 686         return '\'';
 687
 688       return *lexer->prog;
 689     }
 690 }
 691
 692 /* Makes the current token become the next token to be read; the
 693    current token is set to T. */
 694 void
 695 lex_put_back (struct lexer *lexer, int t)
 696 {
 697   save_token (lexer);
 698   lexer->token = t;
 699 }
 700
 701 /* Makes the current token become the next token to be read; the
 702    current token is set to the identifier ID. */
 703 void
 704 lex_put_back_id (struct lexer *lexer, const char *id)
 705 {
 706   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 707   save_token (lexer);
 708   lexer->token = T_ID;
 709   ds_assign_cstr (&lexer->tokstr, id);
 710   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 711 }
 712 \f
 713 /* Weird line processing functions. */
 714
 715 /* Returns the entire contents of the current line. */
 716 const char *
 717 lex_entire_line (struct lexer *lexer)
 718 {
 719   return ds_cstr (&lexer->line_buffer);
 720 }
 721
 722 const struct string *
 723 lex_entire_line_ds (struct lexer *lexer)
 724 {
 725   return &lexer->line_buffer;
 726 }
 727
 728 /* As lex_entire_line(), but only returns the part of the current line
 729    that hasn't already been tokenized.
 730    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 731    ends with a terminal dot, or zero if it doesn't. */
 732 const char *
 733 lex_rest_of_line (struct lexer *lexer, int *end_dot)
 734 {
 735   if (end_dot)
 736     *end_dot = lexer->dot;
 737   return lexer->prog;
 738 }
 739
 740 /* Causes the rest of the current input line to be ignored for
 741    tokenization purposes. */
 742 void
 743 lex_discard_line (struct lexer *lexer)
 744 {
 745   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 746   lexer->prog = ds_end (&lexer->line_buffer);
 747   lexer->dot = false;
 748   lexer->put_token = 0;
 749 }
 750
 751
 752 /* Discards the rest of the current command.
 753    When we're reading commands from a file, we skip tokens until
 754    a terminal dot or EOF.
 755    When we're reading commands interactively from the user,
 756    that's just discarding the current line, because presumably
 757    the user doesn't want to finish typing a command that will be
 758    ignored anyway. */
 759 void
 760 lex_discard_rest_of_command (struct lexer *lexer)
 761 {
 762   if (!getl_is_interactive (lexer->ss))
 763     {
 764       while (lexer->token != T_STOP && lexer->token != '.')
 765         lex_get (lexer);
 766     }
 767   else
 768     lex_discard_line (lexer);
 769 }
 770 \f
 771 /* Weird line reading functions. */
 772
 773 /* Remove C-style comments in STRING, begun by slash-star and
 774    terminated by star-slash or newline. */
 775 static void
 776 strip_comments (struct string *string)
 777 {
 778   char *cp;
 779   int quote;
 780   bool in_comment;
 781
 782   in_comment = false;
 783   quote = EOF;
 784   for (cp = ds_cstr (string); *cp; )
 785     {
 786       /* If we're not in a comment, check for quote marks. */
 787       if (!in_comment)
 788         {
 789           if (*cp == quote)
 790             quote = EOF;
 791           else if (*cp == '\'' || *cp == '"')
 792             quote = *cp;
 793         }
 794
 795       /* If we're not inside a quotation, check for comment. */
 796       if (quote == EOF)
 797         {
 798           if (cp[0] == '/' && cp[1] == '*')
 799             {
 800               in_comment = true;
 801               *cp++ = ' ';
 802               *cp++ = ' ';
 803               continue;
 804             }
 805           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 806             {
 807               in_comment = false;
 808               *cp++ = ' ';
 809               *cp++ = ' ';
 810               continue;
 811             }
 812         }
 813
 814       /* Check commenting. */
 815       if (in_comment)
 816         *cp = ' ';
 817       cp++;
 818     }
 819 }
 820
 821 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 822    tokenization by stripping comments and determining whether it
 823    is the beginning or end of a command and storing into
 824    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 825 void
 826 lex_preprocess_line (struct string *line,
 827                      enum getl_syntax syntax,
 828                      bool *line_starts_command,
 829                      bool *line_ends_command)
 830 {
 831   strip_comments (line);
 832   ds_rtrim (line, ss_cstr (CC_SPACES));
 833   *line_ends_command = (ds_chomp (line, get_endcmd ())
 834                         || (ds_is_empty (line) && get_nulline ()));
 835   *line_starts_command = false;
 836   if (syntax == GETL_BATCH)
 837     {
 838       int first = ds_first (line);
 839       *line_starts_command = !isspace (first);
 840       if (first == '+' || first == '-')
 841         *ds_data (line) = ' ';
 842     }
 843 }
 844
 845 /* Reads a line, without performing any preprocessing.
 846    Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax
 847    mode. */
 848 bool
 849 lex_get_line_raw (struct lexer *lexer, enum getl_syntax *syntax)
 850 {
 851   enum getl_syntax dummy;
 852   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer,
 853                               syntax != NULL ? syntax : &dummy);
 854   return ok;
 855 }
 856
 857 /* Reads a line for use by the tokenizer, and preprocesses it by
 858    removing comments, stripping trailing whitespace and the
 859    terminal dot, and removing leading indentors. */
 860 bool
 861 lex_get_line (struct lexer *lexer)
 862 {
 863   bool line_starts_command;
 864   enum getl_syntax syntax;
 865
 866   if (!lex_get_line_raw (lexer, &syntax))
 867     return false;
 868
 869   lex_preprocess_line (&lexer->line_buffer, syntax,
 870                        &line_starts_command, &lexer->dot);
 871   if (line_starts_command)
 872     lexer->put_token = '.';
 873
 874   lexer->prog = ds_cstr (&lexer->line_buffer);
 875   return true;
 876 }
 877 \f
 878 /* Token names. */
 879
 880 /* Returns the name of a token. */
 881 const char *
 882 lex_token_name (int token)
 883 {
 884   if (lex_is_keyword (token))
 885     return lex_id_name (token);
 886   else if (token < 256)
 887     {
 888       static char t[256][2];
 889       char *s = t[token];
 890       s[0] = token;
 891       s[1] = '\0';
 892       return s;
 893     }
 894   else
 895     NOT_REACHED ();
 896 }
 897
 898 /* Returns an ASCII representation of the current token as a
 899    malloc()'d string. */
 900 char *
 901 lex_token_representation (struct lexer *lexer)
 902 {
 903   char *token_rep;
 904
 905   switch (lexer->token)
 906     {
 907     case T_ID:
 908     case T_POS_NUM:
 909     case T_NEG_NUM:
 910       return ds_xstrdup (&lexer->tokstr);
 911       break;
 912
 913     case T_STRING:
 914       {
 915         int hexstring = 0;
 916         char *sp, *dp;
 917
 918         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 919           if (!isprint ((unsigned char) *sp))
 920             {
 921               hexstring = 1;
 922               break;
 923             }
 924
 925         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 926
 927         dp = token_rep;
 928         if (hexstring)
 929           *dp++ = 'X';
 930         *dp++ = '\'';
 931
 932         if (!hexstring)
 933           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 934             {
 935               if (*sp == '\'')
 936                 *dp++ = '\'';
 937               *dp++ = (unsigned char) *sp++;
 938             }
 939         else
 940           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 941             {
 942               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 943               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 944             }
 945         *dp++ = '\'';
 946         *dp = '\0';
 947
 948         return token_rep;
 949       }
 950     break;
 951
 952     case T_STOP:
 953       token_rep = xmalloc (1);
 954       *token_rep = '\0';
 955       return token_rep;
 956
 957     case T_EXP:
 958       return xstrdup ("**");
 959
 960     default:
 961       return xstrdup (lex_token_name (lexer->token));
 962     }
 963
 964   NOT_REACHED ();
 965 }
 966 \f
 967 /* Really weird functions. */
 968
 969 /* Most of the time, a `-' is a lead-in to a negative number.  But
 970    sometimes it's actually part of the syntax.  If a dash can be part
 971    of syntax then this function is called to rip it off of a
 972    number. */
 973 void
 974 lex_negative_to_dash (struct lexer *lexer)
 975 {
 976   if (lexer->token == T_NEG_NUM)
 977     {
 978       lexer->token = T_POS_NUM;
 979       lexer->tokval = -lexer->tokval;
 980       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 981       save_token (lexer);
 982       lexer->token = '-';
 983     }
 984 }
 985
 986 /* Skip a COMMENT command. */
 987 void
 988 lex_skip_comment (struct lexer *lexer)
 989 {
 990   for (;;)
 991     {
 992       if (!lex_get_line (lexer))
 993         {
 994           lexer->put_token = T_STOP;
 995           lexer->eof = true;
 996           return;
 997         }
 998
 999       if (lexer->put_token == '.')
1000         break;
1001
1002       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1003       lexer->prog = ds_end (&lexer->line_buffer);
1004       if (lexer->dot)
1005         break;
1006     }
1007 }
1008 \f
1009 /* Private functions. */
1010
1011 /* When invoked, tokstr contains a string of binary, octal, or
1012    hex digits, according to TYPE.  The string is converted to
1013    characters having the specified values. */
1014 static void
1015 convert_numeric_string_to_char_string (struct lexer *lexer,
1016                                        enum string_type type)
1017 {
1018   const char *base_name;
1019   int base;
1020   int chars_per_byte;
1021   size_t byte_cnt;
1022   size_t i;
1023   char *p;
1024
1025   switch (type)
1026     {
1027     case BINARY_STRING:
1028       base_name = _("binary");
1029       base = 2;
1030       chars_per_byte = 8;
1031       break;
1032     case OCTAL_STRING:
1033       base_name = _("octal");
1034       base = 8;
1035       chars_per_byte = 3;
1036       break;
1037     case HEX_STRING:
1038       base_name = _("hex");
1039       base = 16;
1040       chars_per_byte = 2;
1041       break;
1042     default:
1043       NOT_REACHED ();
1044     }
1045
1046   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1047   if (ds_length (&lexer->tokstr) % chars_per_byte)
1048     msg (SE, _("String of %s digits has %d characters, which is not a "
1049                "multiple of %d."),
1050          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1051
1052   p = ds_cstr (&lexer->tokstr);
1053   for (i = 0; i < byte_cnt; i++)
1054     {
1055       int value;
1056       int j;
1057
1058       value = 0;
1059       for (j = 0; j < chars_per_byte; j++, p++)
1060         {
1061           int v;
1062
1063           if (*p >= '0' && *p <= '9')
1064             v = *p - '0';
1065           else
1066             {
1067               static const char alpha[] = "abcdef";
1068               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1069
1070               if (q)
1071                 v = q - alpha + 10;
1072               else
1073                 v = base;
1074             }
1075
1076           if (v >= base)
1077             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1078
1079           value = value * base + v;
1080         }
1081
1082       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1083     }
1084
1085   ds_truncate (&lexer->tokstr, byte_cnt);
1086 }
1087
1088 /* Parses a string from the input buffer into tokstr.  The input
1089    buffer pointer lexer->prog must point to the initial single or double
1090    quote.  TYPE indicates the type of string to be parsed.
1091    Returns token type. */
1092 static int
1093 parse_string (struct lexer *lexer, enum string_type type)
1094 {
1095   if (type != CHARACTER_STRING)
1096     lexer->prog++;
1097
1098   /* Accumulate the entire string, joining sections indicated by +
1099      signs. */
1100   for (;;)
1101     {
1102       /* Single or double quote. */
1103       int c = *lexer->prog++;
1104
1105       /* Accumulate section. */
1106       for (;;)
1107         {
1108           /* Check end of line. */
1109           if (*lexer->prog == '\0')
1110             {
1111               msg (SE, _("Unterminated string constant."));
1112               goto finish;
1113             }
1114
1115           /* Double quote characters to embed them in strings. */
1116           if (*lexer->prog == c)
1117             {
1118               if (lexer->prog[1] == c)
1119                 lexer->prog++;
1120               else
1121                 break;
1122             }
1123
1124           ds_put_char (&lexer->tokstr, *lexer->prog++);
1125         }
1126       lexer->prog++;
1127
1128       /* Skip whitespace after final quote mark. */
1129       if (lexer->eof)
1130         break;
1131       for (;;)
1132         {
1133           while (isspace ((unsigned char) *lexer->prog))
1134             lexer->prog++;
1135           if (*lexer->prog)
1136             break;
1137
1138           if (lexer->dot)
1139             goto finish;
1140
1141           if (!lex_get_line (lexer))
1142             goto finish;
1143         }
1144
1145       /* Skip plus sign. */
1146       if (*lexer->prog != '+')
1147         break;
1148       lexer->prog++;
1149
1150       /* Skip whitespace after plus sign. */
1151       if (lexer->eof)
1152         break;
1153       for (;;)
1154         {
1155           while (isspace ((unsigned char) *lexer->prog))
1156             lexer->prog++;
1157           if (*lexer->prog)
1158             break;
1159
1160           if (lexer->dot)
1161             goto finish;
1162
1163           if (!lex_get_line (lexer))
1164             {
1165               msg (SE, _("Unexpected end of file in string concatenation."));
1166               goto finish;
1167             }
1168         }
1169
1170       /* Ensure that a valid string follows. */
1171       if (*lexer->prog != '\'' && *lexer->prog != '"')
1172         {
1173           msg (SE, _("String expected following `+'."));
1174           goto finish;
1175         }
1176     }
1177
1178   /* We come here when we've finished concatenating all the string sections
1179      into one large string. */
1180 finish:
1181   if (type != CHARACTER_STRING)
1182     convert_numeric_string_to_char_string (lexer, type);
1183
1184   if (ds_length (&lexer->tokstr) > 255)
1185     {
1186       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1187            ds_length (&lexer->tokstr));
1188       ds_truncate (&lexer->tokstr, 255);
1189     }
1190
1191   return T_STRING;
1192 }
1193 \f
1194 #if DUMP_TOKENS
1195 /* Reads one token from the lexer and writes a textual representation
1196    on stdout for debugging purposes. */
1197 static void
1198 dump_token (struct lexer *lexer)
1199 {
1200   {
1201     const char *curfn;
1202     int curln;
1203
1204     getl_location (&curfn, &curln);
1205     if (curfn)
1206       fprintf (stderr, "%s:%d\t", curfn, curln);
1207   }
1208
1209   switch (lexer->token)
1210     {
1211     case T_ID:
1212       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1213       break;
1214
1215     case T_POS_NUM:
1216     case T_NEG_NUM:
1217       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1218       break;
1219
1220     case T_STRING:
1221       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1222       break;
1223
1224     case T_STOP:
1225       fprintf (stderr, "STOP\n");
1226       break;
1227
1228     case T_EXP:
1229       fprintf (stderr, "MISC\tEXP\"");
1230       break;
1231
1232     case 0:
1233       fprintf (stderr, "MISC\tEOF\n");
1234       break;
1235
1236     default:
1237       if (lex_is_keyword (token))
1238         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1239       else
1240         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1241       break;
1242     }
1243 }
1244 #endif /* DUMP_TOKENS */
1245
1246
1247 /* Token Accessor Functions */
1248
1249 int
1250 lex_token (const struct lexer *lexer)
1251 {
1252   return lexer->token;
1253 }
1254
1255 double
1256 lex_tokval (const struct lexer *lexer)
1257 {
1258   return lexer->tokval;
1259 }
1260
1261 const char *
1262 lex_tokid (const struct lexer *lexer)
1263 {
1264   return lexer->tokid;
1265 }
1266
1267 const struct string *
1268 lex_tokstr (const struct lexer *lexer)
1269 {
1270   return &lexer->tokstr;
1271 }