src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <libpspp/magic.h>
  34 #include <data/settings.h>
  35 #include <libpspp/getl.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44
  45 #define DUMP_TOKENS 0
  46
  47
  48 struct lexer
  49 {
  50   struct string line_buffer;
  51
  52   struct source_stream *ss;
  53
  54   int token;      /* Current token. */
  55   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  56
  57   char tokid [LONG_NAME_LEN + 1];   /* T_ID: the identifier. */
  58
  59   struct string tokstr;   /* T_ID, T_STRING: token string value.
  60                             For T_ID, this is not truncated as is
  61                             tokid. */
  62
  63   char *prog; /* Pointer to next token in line_buffer. */
  64   bool dot;   /* True only if this line ends with a terminal dot. */
  65
  66   int put_token ; /* If nonzero, next token returned by lex_get().
  67                     Used only in exceptional circumstances. */
  68
  69   struct string put_tokstr;
  70   double put_tokval;
  71 };
  72
  73
  74 static int parse_id (struct lexer *);
  75
  76 /* How a string represents its contents. */
  77 enum string_type
  78   {
  79     CHARACTER_STRING,   /* Characters. */
  80     BINARY_STRING,      /* Binary digits. */
  81     OCTAL_STRING,       /* Octal digits. */
  82     HEX_STRING          /* Hexadecimal digits. */
  83   };
  84
  85 static int parse_string (struct lexer *, enum string_type);
  86
  87 #if DUMP_TOKENS
  88 static void dump_token (struct lexer *);
  89 #endif
  90 \f
  91 /* Initialization. */
  92
  93 /* Initializes the lexer. */
  94 struct lexer *
  95 lex_create (struct source_stream *ss)
  96 {
  97   struct lexer *lexer = xzalloc (sizeof (*lexer));
  98
  99   ds_init_empty (&lexer->tokstr);
 100   ds_init_empty (&lexer->put_tokstr);
 101   ds_init_empty (&lexer->line_buffer);
 102   lexer->ss = ss;
 103
 104   return lexer;
 105 }
 106
 107 struct source_stream *
 108 lex_get_source_stream (const struct lexer *lex)
 109 {
 110   return lex->ss;
 111 }
 112
 113
 114 void
 115 lex_destroy (struct lexer *lexer)
 116 {
 117   if ( NULL != lexer )
 118     {
 119       ds_destroy (&lexer->put_tokstr);
 120       ds_destroy (&lexer->tokstr);
 121       ds_destroy (&lexer->line_buffer);
 122
 123       free (lexer);
 124     }
 125 }
 126
 127 \f
 128 /* Common functions. */
 129
 130 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 131    tokval, respectively, and sets tokid appropriately. */
 132 static void
 133 restore_token (struct lexer *lexer)
 134 {
 135   assert (lexer->put_token != 0);
 136   lexer->token = lexer->put_token;
 137   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 138   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 139   lexer->tokval = lexer->put_tokval;
 140   lexer->put_token = 0;
 141 }
 142
 143 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 144    put_lexer->tokval respectively. */
 145 static void
 146 save_token (struct lexer *lexer)
 147 {
 148   lexer->put_token = lexer->token;
 149   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 150   lexer->put_tokval = lexer->tokval;
 151 }
 152
 153 /* Parses a single token, setting appropriate global variables to
 154    indicate the token's attributes. */
 155 void
 156 lex_get (struct lexer *lexer)
 157 {
 158   /* Find a token. */
 159   for (;;)
 160     {
 161       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 162         {
 163           lexer->token = T_STOP;
 164           return;
 165         }
 166
 167   /* If a token was pushed ahead, return it. */
 168   if (lexer->put_token)
 169     {
 170       restore_token (lexer);
 171 #if DUMP_TOKENS
 172           dump_token (lexer);
 173 #endif
 174       return;
 175     }
 176
 177   for (;;)
 178     {
 179       /* Skip whitespace. */
 180           while (isspace ((unsigned char) *lexer->prog))
 181             lexer->prog++;
 182
 183           if (*lexer->prog)
 184             break;
 185
 186           if (lexer->dot)
 187             {
 188               lexer->dot = 0;
 189               lexer->token = '.';
 190 #if DUMP_TOKENS
 191               dump_token (lexer);
 192 #endif
 193               return;
 194             }
 195           else if (!lex_get_line (lexer))
 196             {
 197               lexer->prog = NULL;
 198               lexer->token = T_STOP;
 199 #if DUMP_TOKENS
 200               dump_token (lexer);
 201 #endif
 202               return;
 203             }
 204
 205           if (lexer->put_token)
 206             {
 207               restore_token (lexer);
 208 #if DUMP_TOKENS
 209               dump_token (lexer);
 210 #endif
 211               return;
 212             }
 213         }
 214
 215
 216       /* Actually parse the token. */
 217       ds_clear (&lexer->tokstr);
 218
 219       switch (*lexer->prog)
 220         {
 221         case '-': case '.':
 222         case '0': case '1': case '2': case '3': case '4':
 223         case '5': case '6': case '7': case '8': case '9':
 224           {
 225             char *tail;
 226
 227             /* `-' can introduce a negative number, or it can be a
 228                token by itself.  If it is not followed by a digit or a
 229                decimal point, it is definitely not a number.
 230                Otherwise, it might be either, but most of the time we
 231                want it as a number.  When the syntax calls for a `-'
 232                token, lex_negative_to_dash() must be used to break
 233                negative numbers into two tokens. */
 234             if (*lexer->prog == '-')
 235               {
 236                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 237                 while (isspace ((unsigned char) *lexer->prog))
 238                   lexer->prog++;
 239
 240                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 241                   {
 242                     lexer->token = '-';
 243                     break;
 244                   }
 245                 lexer->token = T_NEG_NUM;
 246               }
 247             else
 248               lexer->token = T_POS_NUM;
 249
 250             /* Parse the number, copying it into tokstr. */
 251             while (isdigit ((unsigned char) *lexer->prog))
 252               ds_put_char (&lexer->tokstr, *lexer->prog++);
 253             if (*lexer->prog == '.')
 254               {
 255                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 256                 while (isdigit ((unsigned char) *lexer->prog))
 257                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 258               }
 259             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 260               {
 261                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 262                 if (*lexer->prog == '+' || *lexer->prog == '-')
 263                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 264                 while (isdigit ((unsigned char) *lexer->prog))
 265                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 266               }
 267
 268             /* Parse as floating point. */
 269             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 270             if (*tail)
 271               {
 272                 msg (SE, _("%s does not form a valid number."),
 273                      ds_cstr (&lexer->tokstr));
 274                 lexer->tokval = 0.0;
 275
 276                 ds_clear (&lexer->tokstr);
 277                 ds_put_char (&lexer->tokstr, '0');
 278               }
 279
 280             break;
 281           }
 282
 283         case '\'': case '"':
 284           lexer->token = parse_string (lexer, CHARACTER_STRING);
 285           break;
 286
 287         case '(': case ')': case ',': case '=': case '+': case '/':
 288           lexer->token = *lexer->prog++;
 289           break;
 290
 291         case '*':
 292           if (*++lexer->prog == '*')
 293             {
 294               lexer->prog++;
 295               lexer->token = T_EXP;
 296             }
 297           else
 298             lexer->token = '*';
 299           break;
 300
 301         case '<':
 302           if (*++lexer->prog == '=')
 303             {
 304               lexer->prog++;
 305               lexer->token = T_LE;
 306             }
 307           else if (*lexer->prog == '>')
 308             {
 309               lexer->prog++;
 310               lexer->token = T_NE;
 311             }
 312           else
 313             lexer->token = T_LT;
 314           break;
 315
 316         case '>':
 317           if (*++lexer->prog == '=')
 318             {
 319               lexer->prog++;
 320               lexer->token = T_GE;
 321             }
 322           else
 323             lexer->token = T_GT;
 324           break;
 325
 326         case '~':
 327           if (*++lexer->prog == '=')
 328             {
 329               lexer->prog++;
 330               lexer->token = T_NE;
 331             }
 332           else
 333             lexer->token = T_NOT;
 334           break;
 335
 336         case '&':
 337           lexer->prog++;
 338           lexer->token = T_AND;
 339           break;
 340
 341         case '|':
 342           lexer->prog++;
 343           lexer->token = T_OR;
 344           break;
 345
 346         case 'b': case 'B':
 347           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 348             lexer->token = parse_string (lexer, BINARY_STRING);
 349           else
 350             lexer->token = parse_id (lexer);
 351           break;
 352
 353         case 'o': case 'O':
 354           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 355             lexer->token = parse_string (lexer, OCTAL_STRING);
 356           else
 357             lexer->token = parse_id (lexer);
 358           break;
 359
 360         case 'x': case 'X':
 361           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 362             lexer->token = parse_string (lexer, HEX_STRING);
 363           else
 364             lexer->token = parse_id (lexer);
 365           break;
 366
 367         default:
 368           if (lex_is_id1 (*lexer->prog))
 369             {
 370               lexer->token = parse_id (lexer);
 371               break;
 372             }
 373           else
 374             {
 375               if (isgraph ((unsigned char) *lexer->prog))
 376                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 377               else
 378                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 379               continue;
 380             }
 381         }
 382       break;
 383     }
 384
 385 #if DUMP_TOKENS
 386   dump_token (lexer);
 387 #endif
 388 }
 389
 390 /* Parses an identifier at the current position into tokid and
 391    tokstr.
 392    Returns the correct token type. */
 393 static int
 394 parse_id (struct lexer *lexer)
 395 {
 396   struct substring rest_of_line
 397     = ss_substr (ds_ss (&lexer->line_buffer),
 398                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 399                  SIZE_MAX);
 400   struct substring id = ss_head (rest_of_line,
 401                                  lex_id_get_length (rest_of_line));
 402   lexer->prog += ss_length (id);
 403
 404   ds_assign_substring (&lexer->tokstr, id);
 405   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 406   return lex_id_to_token (id);
 407 }
 408
 409 /* Reports an error to the effect that subcommand SBC may only be
 410    specified once. */
 411 void
 412 lex_sbc_only_once (const char *sbc)
 413 {
 414   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 415 }
 416
 417 /* Reports an error to the effect that subcommand SBC is
 418    missing. */
 419 void
 420 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 421 {
 422   lex_error (lexer, _("missing required subcommand %s"), sbc);
 423 }
 424
 425 /* Prints a syntax error message containing the current token and
 426    given message MESSAGE (if non-null). */
 427 void
 428 lex_error (struct lexer *lexer, const char *message, ...)
 429 {
 430   char *token_rep;
 431   char where[128];
 432
 433   token_rep = lex_token_representation (lexer);
 434   if (lexer->token == T_STOP)
 435     strcpy (where, "end of file");
 436   else if (lexer->token == '.')
 437     strcpy (where, "end of command");
 438   else
 439     snprintf (where, sizeof where, "`%s'", token_rep);
 440   free (token_rep);
 441
 442   if (message)
 443     {
 444       char buf[1024];
 445       va_list args;
 446
 447       va_start (args, message);
 448       vsnprintf (buf, 1024, message, args);
 449       va_end (args);
 450
 451       msg (SE, _("Syntax error %s at %s."), buf, where);
 452     }
 453   else
 454     msg (SE, _("Syntax error at %s."), where);
 455 }
 456
 457 /* Checks that we're at end of command.
 458    If so, returns a successful command completion code.
 459    If not, flags a syntax error and returns an error command
 460    completion code. */
 461 int
 462 lex_end_of_command (struct lexer *lexer)
 463 {
 464   if (lexer->token != '.')
 465     {
 466       lex_error (lexer, _("expecting end of command"));
 467       return CMD_FAILURE;
 468     }
 469   else
 470     return CMD_SUCCESS;
 471 }
 472 \f
 473 /* Token testing functions. */
 474
 475 /* Returns true if the current token is a number. */
 476 bool
 477 lex_is_number (struct lexer *lexer)
 478 {
 479   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 480 }
 481
 482 /* Returns the value of the current token, which must be a
 483    floating point number. */
 484 double
 485 lex_number (struct lexer *lexer)
 486 {
 487   assert (lex_is_number (lexer));
 488   return lexer->tokval;
 489 }
 490
 491 /* Returns true iff the current token is an integer. */
 492 bool
 493 lex_is_integer (struct lexer *lexer)
 494 {
 495   return (lex_is_number (lexer)
 496           && lexer->tokval != NOT_LONG
 497           && lexer->tokval >= LONG_MIN
 498           && lexer->tokval <= LONG_MAX
 499           && floor (lexer->tokval) == lexer->tokval);
 500 }
 501
 502 /* Returns the value of the current token, which must be an
 503    integer. */
 504 long
 505 lex_integer (struct lexer *lexer)
 506 {
 507   assert (lex_is_integer (lexer));
 508   return lexer->tokval;
 509 }
 510 \f
 511 /* Token matching functions. */
 512
 513 /* If TOK is the current token, skips it and returns true
 514    Otherwise, returns false. */
 515 bool
 516 lex_match (struct lexer *lexer, int t)
 517 {
 518   if (lexer->token == t)
 519     {
 520       lex_get (lexer);
 521       return true;
 522     }
 523   else
 524     return false;
 525 }
 526
 527 /* If the current token is the identifier S, skips it and returns
 528    true.  The identifier may be abbreviated to its first three
 529    letters.
 530    Otherwise, returns false. */
 531 bool
 532 lex_match_id (struct lexer *lexer, const char *s)
 533 {
 534   if (lexer->token == T_ID
 535       && lex_id_match (ss_cstr (s), ss_cstr (lexer->tokid)))
 536     {
 537       lex_get (lexer);
 538       return true;
 539     }
 540   else
 541     return false;
 542 }
 543
 544 /* If the current token is integer N, skips it and returns true.
 545    Otherwise, returns false. */
 546 bool
 547 lex_match_int (struct lexer *lexer, int x)
 548 {
 549   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 550     {
 551       lex_get (lexer);
 552       return true;
 553     }
 554   else
 555     return false;
 556 }
 557 \f
 558 /* Forced matches. */
 559
 560 /* If this token is identifier S, fetches the next token and returns
 561    nonzero.
 562    Otherwise, reports an error and returns zero. */
 563 bool
 564 lex_force_match_id (struct lexer *lexer, const char *s)
 565 {
 566   if (lex_match_id (lexer, s))
 567     return true;
 568   else
 569     {
 570       lex_error (lexer, _("expecting `%s'"), s);
 571       return false;
 572     }
 573 }
 574
 575 /* If the current token is T, skips the token.  Otherwise, reports an
 576    error and returns from the current function with return value false. */
 577 bool
 578 lex_force_match (struct lexer *lexer, int t)
 579 {
 580   if (lexer->token == t)
 581     {
 582       lex_get (lexer);
 583       return true;
 584     }
 585   else
 586     {
 587       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 588       return false;
 589     }
 590 }
 591
 592 /* If this token is a string, does nothing and returns true.
 593    Otherwise, reports an error and returns false. */
 594 bool
 595 lex_force_string (struct lexer *lexer)
 596 {
 597   if (lexer->token == T_STRING)
 598     return true;
 599   else
 600     {
 601       lex_error (lexer, _("expecting string"));
 602       return false;
 603     }
 604 }
 605
 606 /* If this token is an integer, does nothing and returns true.
 607    Otherwise, reports an error and returns false. */
 608 bool
 609 lex_force_int (struct lexer *lexer)
 610 {
 611   if (lex_is_integer (lexer))
 612     return true;
 613   else
 614     {
 615       lex_error (lexer, _("expecting integer"));
 616       return false;
 617     }
 618 }
 619
 620 /* If this token is a number, does nothing and returns true.
 621    Otherwise, reports an error and returns false. */
 622 bool
 623 lex_force_num (struct lexer *lexer)
 624 {
 625   if (lex_is_number (lexer))
 626     return true;
 627
 628   lex_error (lexer, _("expecting number"));
 629   return false;
 630 }
 631
 632 /* If this token is an identifier, does nothing and returns true.
 633    Otherwise, reports an error and returns false. */
 634 bool
 635 lex_force_id (struct lexer *lexer)
 636 {
 637   if (lexer->token == T_ID)
 638     return true;
 639
 640   lex_error (lexer, _("expecting identifier"));
 641   return false;
 642 }
 643
 644 /* Weird token functions. */
 645
 646 /* Returns the first character of the next token, except that if the
 647    next token is not an identifier, the character returned will not be
 648    a character that can begin an identifier.  Specifically, the
 649    hexstring lead-in X' causes lookahead() to return '.  Note that an
 650    alphanumeric return value doesn't guarantee an ID token, it could
 651    also be a reserved-word token. */
 652 int
 653 lex_look_ahead (struct lexer *lexer)
 654 {
 655   if (lexer->put_token)
 656     return lexer->put_token;
 657
 658   for (;;)
 659     {
 660       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 661         return 0;
 662
 663       for (;;)
 664         {
 665           while (isspace ((unsigned char) *lexer->prog))
 666             lexer->prog++;
 667           if (*lexer->prog)
 668             break;
 669
 670           if (lexer->dot)
 671             return '.';
 672           else if (!lex_get_line (lexer))
 673             return 0;
 674
 675           if (lexer->put_token)
 676             return lexer->put_token;
 677         }
 678
 679       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 680            || toupper ((unsigned char) *lexer->prog) == 'B'
 681            || toupper ((unsigned char) *lexer->prog) == 'O')
 682           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 683         return '\'';
 684
 685       return *lexer->prog;
 686     }
 687 }
 688
 689 /* Makes the current token become the next token to be read; the
 690    current token is set to T. */
 691 void
 692 lex_put_back (struct lexer *lexer, int t)
 693 {
 694   save_token (lexer);
 695   lexer->token = t;
 696 }
 697
 698 /* Makes the current token become the next token to be read; the
 699    current token is set to the identifier ID. */
 700 void
 701 lex_put_back_id (struct lexer *lexer, const char *id)
 702 {
 703   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 704   save_token (lexer);
 705   lexer->token = T_ID;
 706   ds_assign_cstr (&lexer->tokstr, id);
 707   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 708 }
 709 \f
 710 /* Weird line processing functions. */
 711
 712 /* Returns the entire contents of the current line. */
 713 const char *
 714 lex_entire_line (struct lexer *lexer)
 715 {
 716   return ds_cstr (&lexer->line_buffer);
 717 }
 718
 719 const struct string *
 720 lex_entire_line_ds (struct lexer *lexer)
 721 {
 722   return &lexer->line_buffer;
 723 }
 724
 725 /* As lex_entire_line(), but only returns the part of the current line
 726    that hasn't already been tokenized.
 727    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 728    ends with a terminal dot, or zero if it doesn't. */
 729 const char *
 730 lex_rest_of_line (struct lexer *lexer, int *end_dot)
 731 {
 732   if (end_dot)
 733     *end_dot = lexer->dot;
 734   return lexer->prog;
 735 }
 736
 737 /* Causes the rest of the current input line to be ignored for
 738    tokenization purposes. */
 739 void
 740 lex_discard_line (struct lexer *lexer)
 741 {
 742   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 743   lexer->prog = ds_end (&lexer->line_buffer);
 744   lexer->dot = false;
 745   lexer->put_token = 0;
 746 }
 747
 748
 749 /* Discards the rest of the current command.
 750    When we're reading commands from a file, we skip tokens until
 751    a terminal dot or EOF.
 752    When we're reading commands interactively from the user,
 753    that's just discarding the current line, because presumably
 754    the user doesn't want to finish typing a command that will be
 755    ignored anyway. */
 756 void
 757 lex_discard_rest_of_command (struct lexer *lexer)
 758 {
 759   if (!getl_is_interactive (lexer->ss))
 760     {
 761       while (lexer->token != T_STOP && lexer->token != '.')
 762         lex_get (lexer);
 763     }
 764   else
 765     lex_discard_line (lexer);
 766 }
 767 \f
 768 /* Weird line reading functions. */
 769
 770 /* Remove C-style comments in STRING, begun by slash-star and
 771    terminated by star-slash or newline. */
 772 static void
 773 strip_comments (struct string *string)
 774 {
 775   char *cp;
 776   int quote;
 777   bool in_comment;
 778
 779   in_comment = false;
 780   quote = EOF;
 781   for (cp = ds_cstr (string); *cp; )
 782     {
 783       /* If we're not in a comment, check for quote marks. */
 784       if (!in_comment)
 785         {
 786           if (*cp == quote)
 787             quote = EOF;
 788           else if (*cp == '\'' || *cp == '"')
 789             quote = *cp;
 790         }
 791
 792       /* If we're not inside a quotation, check for comment. */
 793       if (quote == EOF)
 794         {
 795           if (cp[0] == '/' && cp[1] == '*')
 796             {
 797               in_comment = true;
 798               *cp++ = ' ';
 799               *cp++ = ' ';
 800               continue;
 801             }
 802           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 803             {
 804               in_comment = false;
 805               *cp++ = ' ';
 806               *cp++ = ' ';
 807               continue;
 808             }
 809         }
 810
 811       /* Check commenting. */
 812       if (in_comment)
 813         *cp = ' ';
 814       cp++;
 815     }
 816 }
 817
 818 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 819    tokenization by stripping comments and determining whether it
 820    is the beginning or end of a command and storing into
 821    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 822 void
 823 lex_preprocess_line (struct string *line,
 824                      enum getl_syntax syntax,
 825                      bool *line_starts_command,
 826                      bool *line_ends_command)
 827 {
 828   strip_comments (line);
 829   ds_rtrim (line, ss_cstr (CC_SPACES));
 830   *line_ends_command = (ds_chomp (line, get_endcmd ())
 831                         || (ds_is_empty (line) && get_nulline ()));
 832   *line_starts_command = false;
 833   if (syntax == GETL_BATCH)
 834     {
 835       int first = ds_first (line);
 836       *line_starts_command = !isspace (first);
 837       if (first == '+' || first == '-')
 838         *ds_data (line) = ' ';
 839     }
 840 }
 841
 842 /* Reads a line, without performing any preprocessing.
 843    Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax
 844    mode. */
 845 bool
 846 lex_get_line_raw (struct lexer *lexer, enum getl_syntax *syntax)
 847 {
 848   enum getl_syntax dummy;
 849   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer,
 850                               syntax != NULL ? syntax : &dummy);
 851   return ok;
 852 }
 853
 854 /* Reads a line for use by the tokenizer, and preprocesses it by
 855    removing comments, stripping trailing whitespace and the
 856    terminal dot, and removing leading indentors. */
 857 bool
 858 lex_get_line (struct lexer *lexer)
 859 {
 860   bool line_starts_command;
 861   enum getl_syntax syntax;
 862
 863   if (!lex_get_line_raw (lexer, &syntax))
 864     {
 865       lexer->prog = NULL;
 866     return false;
 867     }
 868
 869   lex_preprocess_line (&lexer->line_buffer, syntax,
 870                        &line_starts_command, &lexer->dot);
 871
 872   if (line_starts_command)
 873     lexer->put_token = '.';
 874
 875   lexer->prog = ds_cstr (&lexer->line_buffer);
 876   return true;
 877 }
 878 \f
 879 /* Token names. */
 880
 881 /* Returns the name of a token. */
 882 const char *
 883 lex_token_name (int token)
 884 {
 885   if (lex_is_keyword (token))
 886     return lex_id_name (token);
 887   else if (token < 256)
 888     {
 889       static char t[256][2];
 890       char *s = t[token];
 891       s[0] = token;
 892       s[1] = '\0';
 893       return s;
 894     }
 895   else
 896     NOT_REACHED ();
 897 }
 898
 899 /* Returns an ASCII representation of the current token as a
 900    malloc()'d string. */
 901 char *
 902 lex_token_representation (struct lexer *lexer)
 903 {
 904   char *token_rep;
 905
 906   switch (lexer->token)
 907     {
 908     case T_ID:
 909     case T_POS_NUM:
 910     case T_NEG_NUM:
 911       return ds_xstrdup (&lexer->tokstr);
 912       break;
 913
 914     case T_STRING:
 915       {
 916         int hexstring = 0;
 917         char *sp, *dp;
 918
 919         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 920           if (!isprint ((unsigned char) *sp))
 921             {
 922               hexstring = 1;
 923               break;
 924             }
 925
 926         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 927
 928         dp = token_rep;
 929         if (hexstring)
 930           *dp++ = 'X';
 931         *dp++ = '\'';
 932
 933         if (!hexstring)
 934           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 935             {
 936               if (*sp == '\'')
 937                 *dp++ = '\'';
 938               *dp++ = (unsigned char) *sp++;
 939             }
 940         else
 941           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 942             {
 943               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 944               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 945             }
 946         *dp++ = '\'';
 947         *dp = '\0';
 948
 949         return token_rep;
 950       }
 951     break;
 952
 953     case T_STOP:
 954       token_rep = xmalloc (1);
 955       *token_rep = '\0';
 956       return token_rep;
 957
 958     case T_EXP:
 959       return xstrdup ("**");
 960
 961     default:
 962       return xstrdup (lex_token_name (lexer->token));
 963     }
 964
 965   NOT_REACHED ();
 966 }
 967 \f
 968 /* Really weird functions. */
 969
 970 /* Most of the time, a `-' is a lead-in to a negative number.  But
 971    sometimes it's actually part of the syntax.  If a dash can be part
 972    of syntax then this function is called to rip it off of a
 973    number. */
 974 void
 975 lex_negative_to_dash (struct lexer *lexer)
 976 {
 977   if (lexer->token == T_NEG_NUM)
 978     {
 979       lexer->token = T_POS_NUM;
 980       lexer->tokval = -lexer->tokval;
 981       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 982       save_token (lexer);
 983       lexer->token = '-';
 984     }
 985 }
 986
 987 /* Skip a COMMENT command. */
 988 void
 989 lex_skip_comment (struct lexer *lexer)
 990 {
 991   for (;;)
 992     {
 993       if (!lex_get_line (lexer))
 994         {
 995           lexer->put_token = T_STOP;
 996           lexer->prog = NULL;
 997           return;
 998         }
 999
1000       if (lexer->put_token == '.')
1001         break;
1002
1003       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1004       lexer->prog = ds_end (&lexer->line_buffer);
1005       if (lexer->dot)
1006         break;
1007     }
1008 }
1009 \f
1010 /* Private functions. */
1011
1012 /* When invoked, tokstr contains a string of binary, octal, or
1013    hex digits, according to TYPE.  The string is converted to
1014    characters having the specified values. */
1015 static void
1016 convert_numeric_string_to_char_string (struct lexer *lexer,
1017                                        enum string_type type)
1018 {
1019   const char *base_name;
1020   int base;
1021   int chars_per_byte;
1022   size_t byte_cnt;
1023   size_t i;
1024   char *p;
1025
1026   switch (type)
1027     {
1028     case BINARY_STRING:
1029       base_name = _("binary");
1030       base = 2;
1031       chars_per_byte = 8;
1032       break;
1033     case OCTAL_STRING:
1034       base_name = _("octal");
1035       base = 8;
1036       chars_per_byte = 3;
1037       break;
1038     case HEX_STRING:
1039       base_name = _("hex");
1040       base = 16;
1041       chars_per_byte = 2;
1042       break;
1043     default:
1044       NOT_REACHED ();
1045     }
1046
1047   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1048   if (ds_length (&lexer->tokstr) % chars_per_byte)
1049     msg (SE, _("String of %s digits has %d characters, which is not a "
1050                "multiple of %d."),
1051          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1052
1053   p = ds_cstr (&lexer->tokstr);
1054   for (i = 0; i < byte_cnt; i++)
1055     {
1056       int value;
1057       int j;
1058
1059       value = 0;
1060       for (j = 0; j < chars_per_byte; j++, p++)
1061         {
1062           int v;
1063
1064           if (*p >= '0' && *p <= '9')
1065             v = *p - '0';
1066           else
1067             {
1068               static const char alpha[] = "abcdef";
1069               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1070
1071               if (q)
1072                 v = q - alpha + 10;
1073               else
1074                 v = base;
1075             }
1076
1077           if (v >= base)
1078             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1079
1080           value = value * base + v;
1081         }
1082
1083       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1084     }
1085
1086   ds_truncate (&lexer->tokstr, byte_cnt);
1087 }
1088
1089 /* Parses a string from the input buffer into tokstr.  The input
1090    buffer pointer lexer->prog must point to the initial single or double
1091    quote.  TYPE indicates the type of string to be parsed.
1092    Returns token type. */
1093 static int
1094 parse_string (struct lexer *lexer, enum string_type type)
1095 {
1096   if (type != CHARACTER_STRING)
1097     lexer->prog++;
1098
1099   /* Accumulate the entire string, joining sections indicated by +
1100      signs. */
1101   for (;;)
1102     {
1103       /* Single or double quote. */
1104       int c = *lexer->prog++;
1105
1106       /* Accumulate section. */
1107       for (;;)
1108         {
1109           /* Check end of line. */
1110           if (*lexer->prog == '\0')
1111             {
1112               msg (SE, _("Unterminated string constant."));
1113               goto finish;
1114             }
1115
1116           /* Double quote characters to embed them in strings. */
1117           if (*lexer->prog == c)
1118             {
1119               if (lexer->prog[1] == c)
1120                 lexer->prog++;
1121               else
1122                 break;
1123             }
1124
1125           ds_put_char (&lexer->tokstr, *lexer->prog++);
1126         }
1127       lexer->prog++;
1128
1129       /* Skip whitespace after final quote mark. */
1130       if (lexer->prog == NULL)
1131         break;
1132       for (;;)
1133         {
1134           while (isspace ((unsigned char) *lexer->prog))
1135             lexer->prog++;
1136           if (*lexer->prog)
1137             break;
1138
1139           if (lexer->dot)
1140             goto finish;
1141
1142           if (!lex_get_line (lexer))
1143             goto finish;
1144         }
1145
1146       /* Skip plus sign. */
1147       if (*lexer->prog != '+')
1148         break;
1149       lexer->prog++;
1150
1151       /* Skip whitespace after plus sign. */
1152       if (lexer->prog == NULL)
1153         break;
1154       for (;;)
1155         {
1156           while (isspace ((unsigned char) *lexer->prog))
1157             lexer->prog++;
1158           if (*lexer->prog)
1159             break;
1160
1161           if (lexer->dot)
1162             goto finish;
1163
1164           if (!lex_get_line (lexer))
1165             {
1166               msg (SE, _("Unexpected end of file in string concatenation."));
1167               goto finish;
1168             }
1169         }
1170
1171       /* Ensure that a valid string follows. */
1172       if (*lexer->prog != '\'' && *lexer->prog != '"')
1173         {
1174           msg (SE, _("String expected following `+'."));
1175           goto finish;
1176         }
1177     }
1178
1179   /* We come here when we've finished concatenating all the string sections
1180      into one large string. */
1181 finish:
1182   if (type != CHARACTER_STRING)
1183     convert_numeric_string_to_char_string (lexer, type);
1184
1185   if (ds_length (&lexer->tokstr) > 255)
1186     {
1187       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1188            ds_length (&lexer->tokstr));
1189       ds_truncate (&lexer->tokstr, 255);
1190     }
1191
1192   return T_STRING;
1193 }
1194 \f
1195 #if DUMP_TOKENS
1196 /* Reads one token from the lexer and writes a textual representation
1197    on stdout for debugging purposes. */
1198 static void
1199 dump_token (struct lexer *lexer)
1200 {
1201   {
1202     const char *curfn;
1203     int curln;
1204
1205     curln = getl_source_location (lexer->ss);
1206     curfn = getl_source_name (lexer->ss);
1207     if (curfn)
1208       fprintf (stderr, "%s:%d\t", curfn, curln);
1209   }
1210
1211   switch (lexer->token)
1212     {
1213     case T_ID:
1214       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1215       break;
1216
1217     case T_POS_NUM:
1218     case T_NEG_NUM:
1219       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1220       break;
1221
1222     case T_STRING:
1223       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1224       break;
1225
1226     case T_STOP:
1227       fprintf (stderr, "STOP\n");
1228       break;
1229
1230     case T_EXP:
1231       fprintf (stderr, "MISC\tEXP\"");
1232       break;
1233
1234     case 0:
1235       fprintf (stderr, "MISC\tEOF\n");
1236       break;
1237
1238     default:
1239       if (lex_is_keyword (lexer->token))
1240         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (lexer->token));
1241       else
1242         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1243       break;
1244     }
1245 }
1246 #endif /* DUMP_TOKENS */
1247
1248
1249 /* Token Accessor Functions */
1250
1251 int
1252 lex_token (const struct lexer *lexer)
1253 {
1254   return lexer->token;
1255 }
1256
1257 double
1258 lex_tokval (const struct lexer *lexer)
1259 {
1260   return lexer->tokval;
1261 }
1262
1263 const char *
1264 lex_tokid (const struct lexer *lexer)
1265 {
1266   return lexer->tokid;
1267 }
1268
1269 const struct string *
1270 lex_tokstr (const struct lexer *lexer)
1271 {
1272   return &lexer->tokstr;
1273 }