src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18 #include "lexer.h"
  19 #include <libpspp/message.h>
  20 #include <ctype.h>
  21 #include <errno.h>
  22 #include <limits.h>
  23 #include <math.h>
  24 #include <stdarg.h>
  25 #include <stdint.h>
  26 #include <stdlib.h>
  27 #include <libpspp/assertion.h>
  28 #include <language/command.h>
  29 #include <libpspp/message.h>
  30 #include <data/settings.h>
  31 #include <libpspp/getl.h>
  32 #include <libpspp/str.h>
  33 #include <output/journal.h>
  34
  35 #include "xalloc.h"
  36
  37 #include "gettext.h"
  38 #define _(msgid) gettext (msgid)
  39 #define N_(msgid) msgid
  40
  41
  42 #define DUMP_TOKENS 0
  43
  44
  45
  46 struct lexer
  47 {
  48   struct string line_buffer;
  49
  50   struct source_stream *ss;
  51
  52   int token;      /* Current token. */
  53   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  54
  55   char tokid [VAR_NAME_LEN + 1];   /* T_ID: the identifier. */
  56
  57   struct string tokstr;   /* T_ID, T_STRING: token string value.
  58                             For T_ID, this is not truncated as is
  59                             tokid. */
  60
  61   char *prog; /* Pointer to next token in line_buffer. */
  62   bool dot;   /* True only if this line ends with a terminal dot. */
  63
  64   int put_token ; /* If nonzero, next token returned by lex_get().
  65                     Used only in exceptional circumstances. */
  66
  67   struct string put_tokstr;
  68   double put_tokval;
  69 };
  70
  71
  72 static int parse_id (struct lexer *);
  73
  74 /* How a string represents its contents. */
  75 enum string_type
  76   {
  77     CHARACTER_STRING,   /* Characters. */
  78     BINARY_STRING,      /* Binary digits. */
  79     OCTAL_STRING,       /* Octal digits. */
  80     HEX_STRING          /* Hexadecimal digits. */
  81   };
  82
  83 static int parse_string (struct lexer *, enum string_type);
  84
  85 #if DUMP_TOKENS
  86 static void dump_token (struct lexer *);
  87 #endif
  88 \f
  89 /* Initialization. */
  90
  91 /* Initializes the lexer. */
  92 struct lexer *
  93 lex_create (struct source_stream *ss)
  94 {
  95   struct lexer *lexer = xzalloc (sizeof (*lexer));
  96
  97   ds_init_empty (&lexer->tokstr);
  98   ds_init_empty (&lexer->put_tokstr);
  99   ds_init_empty (&lexer->line_buffer);
 100   lexer->ss = ss;
 101
 102   return lexer;
 103 }
 104
 105 struct source_stream *
 106 lex_get_source_stream (const struct lexer *lex)
 107 {
 108   return lex->ss;
 109 }
 110
 111 enum syntax_mode
 112 lex_current_syntax_mode (const struct lexer *lex)
 113 {
 114   return source_stream_current_syntax_mode (lex->ss);
 115 }
 116
 117 enum error_mode
 118 lex_current_error_mode (const struct lexer *lex)
 119 {
 120   return source_stream_current_error_mode (lex->ss);
 121 }
 122
 123
 124 void
 125 lex_destroy (struct lexer *lexer)
 126 {
 127   if ( NULL != lexer )
 128     {
 129       ds_destroy (&lexer->put_tokstr);
 130       ds_destroy (&lexer->tokstr);
 131       ds_destroy (&lexer->line_buffer);
 132
 133       free (lexer);
 134     }
 135 }
 136
 137 \f
 138 /* Common functions. */
 139
 140 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 141    tokval, respectively, and sets tokid appropriately. */
 142 static void
 143 restore_token (struct lexer *lexer)
 144 {
 145   assert (lexer->put_token != 0);
 146   lexer->token = lexer->put_token;
 147   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 148   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 149   lexer->tokval = lexer->put_tokval;
 150   lexer->put_token = 0;
 151 }
 152
 153 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 154    put_lexer->tokval respectively. */
 155 static void
 156 save_token (struct lexer *lexer)
 157 {
 158   lexer->put_token = lexer->token;
 159   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 160   lexer->put_tokval = lexer->tokval;
 161 }
 162
 163 /* Parses a single token, setting appropriate global variables to
 164    indicate the token's attributes. */
 165 void
 166 lex_get (struct lexer *lexer)
 167 {
 168   /* Find a token. */
 169   for (;;)
 170     {
 171       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 172         {
 173           lexer->token = T_STOP;
 174           return;
 175         }
 176
 177   /* If a token was pushed ahead, return it. */
 178   if (lexer->put_token)
 179     {
 180       restore_token (lexer);
 181 #if DUMP_TOKENS
 182           dump_token (lexer);
 183 #endif
 184       return;
 185     }
 186
 187   for (;;)
 188     {
 189       /* Skip whitespace. */
 190           while (isspace ((unsigned char) *lexer->prog))
 191             lexer->prog++;
 192
 193           if (*lexer->prog)
 194             break;
 195
 196           if (lexer->dot)
 197             {
 198               lexer->dot = 0;
 199               lexer->token = '.';
 200 #if DUMP_TOKENS
 201               dump_token (lexer);
 202 #endif
 203               return;
 204             }
 205           else if (!lex_get_line (lexer))
 206             {
 207               lexer->prog = NULL;
 208               lexer->token = T_STOP;
 209 #if DUMP_TOKENS
 210               dump_token (lexer);
 211 #endif
 212               return;
 213             }
 214
 215           if (lexer->put_token)
 216             {
 217               restore_token (lexer);
 218 #if DUMP_TOKENS
 219               dump_token (lexer);
 220 #endif
 221               return;
 222             }
 223         }
 224
 225
 226       /* Actually parse the token. */
 227       ds_clear (&lexer->tokstr);
 228
 229       switch (*lexer->prog)
 230         {
 231         case '-': case '.':
 232         case '0': case '1': case '2': case '3': case '4':
 233         case '5': case '6': case '7': case '8': case '9':
 234           {
 235             char *tail;
 236
 237             /* `-' can introduce a negative number, or it can be a
 238                token by itself.  If it is not followed by a digit or a
 239                decimal point, it is definitely not a number.
 240                Otherwise, it might be either, but most of the time we
 241                want it as a number.  When the syntax calls for a `-'
 242                token, lex_negative_to_dash() must be used to break
 243                negative numbers into two tokens. */
 244             if (*lexer->prog == '-')
 245               {
 246                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 247                 while (isspace ((unsigned char) *lexer->prog))
 248                   lexer->prog++;
 249
 250                 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 251                   {
 252                     lexer->token = '-';
 253                     break;
 254                   }
 255                 lexer->token = T_NEG_NUM;
 256               }
 257             else
 258               lexer->token = T_POS_NUM;
 259
 260             /* Parse the number, copying it into tokstr. */
 261             while (isdigit ((unsigned char) *lexer->prog))
 262               ds_put_char (&lexer->tokstr, *lexer->prog++);
 263             if (*lexer->prog == '.')
 264               {
 265                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 266                 while (isdigit ((unsigned char) *lexer->prog))
 267                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 268               }
 269             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 270               {
 271                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 272                 if (*lexer->prog == '+' || *lexer->prog == '-')
 273                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 274                 while (isdigit ((unsigned char) *lexer->prog))
 275                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 276               }
 277
 278             /* Parse as floating point. */
 279             lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
 280             if (*tail)
 281               {
 282                 msg (SE, _("%s does not form a valid number."),
 283                      ds_cstr (&lexer->tokstr));
 284                 lexer->tokval = 0.0;
 285
 286                 ds_clear (&lexer->tokstr);
 287                 ds_put_char (&lexer->tokstr, '0');
 288               }
 289
 290             break;
 291           }
 292
 293         case '\'': case '"':
 294           lexer->token = parse_string (lexer, CHARACTER_STRING);
 295           break;
 296
 297         case '(': case ')': case ',': case '=': case '+': case '/':
 298         case '[': case ']':
 299           lexer->token = *lexer->prog++;
 300           break;
 301
 302         case '*':
 303           if (*++lexer->prog == '*')
 304             {
 305               lexer->prog++;
 306               lexer->token = T_EXP;
 307             }
 308           else
 309             lexer->token = '*';
 310           break;
 311
 312         case '<':
 313           if (*++lexer->prog == '=')
 314             {
 315               lexer->prog++;
 316               lexer->token = T_LE;
 317             }
 318           else if (*lexer->prog == '>')
 319             {
 320               lexer->prog++;
 321               lexer->token = T_NE;
 322             }
 323           else
 324             lexer->token = T_LT;
 325           break;
 326
 327         case '>':
 328           if (*++lexer->prog == '=')
 329             {
 330               lexer->prog++;
 331               lexer->token = T_GE;
 332             }
 333           else
 334             lexer->token = T_GT;
 335           break;
 336
 337         case '~':
 338           if (*++lexer->prog == '=')
 339             {
 340               lexer->prog++;
 341               lexer->token = T_NE;
 342             }
 343           else
 344             lexer->token = T_NOT;
 345           break;
 346
 347         case '&':
 348           lexer->prog++;
 349           lexer->token = T_AND;
 350           break;
 351
 352         case '|':
 353           lexer->prog++;
 354           lexer->token = T_OR;
 355           break;
 356
 357         case 'b': case 'B':
 358           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 359             lexer->token = parse_string (lexer, BINARY_STRING);
 360           else
 361             lexer->token = parse_id (lexer);
 362           break;
 363
 364         case 'o': case 'O':
 365           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 366             lexer->token = parse_string (lexer, OCTAL_STRING);
 367           else
 368             lexer->token = parse_id (lexer);
 369           break;
 370
 371         case 'x': case 'X':
 372           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 373             lexer->token = parse_string (lexer, HEX_STRING);
 374           else
 375             lexer->token = parse_id (lexer);
 376           break;
 377
 378         default:
 379           if (lex_is_id1 (*lexer->prog))
 380             {
 381               lexer->token = parse_id (lexer);
 382               break;
 383             }
 384           else
 385             {
 386               if (isgraph ((unsigned char) *lexer->prog))
 387                 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
 388               else
 389                 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
 390               continue;
 391             }
 392         }
 393       break;
 394     }
 395
 396 #if DUMP_TOKENS
 397   dump_token (lexer);
 398 #endif
 399 }
 400
 401 /* Parses an identifier at the current position into tokid and
 402    tokstr.
 403    Returns the correct token type. */
 404 static int
 405 parse_id (struct lexer *lexer)
 406 {
 407   struct substring rest_of_line
 408     = ss_substr (ds_ss (&lexer->line_buffer),
 409                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 410                  SIZE_MAX);
 411   struct substring id = ss_head (rest_of_line,
 412                                  lex_id_get_length (rest_of_line));
 413   lexer->prog += ss_length (id);
 414
 415   ds_assign_substring (&lexer->tokstr, id);
 416   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 417   return lex_id_to_token (id);
 418 }
 419
 420 /* Reports an error to the effect that subcommand SBC may only be
 421    specified once. */
 422 void
 423 lex_sbc_only_once (const char *sbc)
 424 {
 425   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 426 }
 427
 428 /* Reports an error to the effect that subcommand SBC is
 429    missing. */
 430 void
 431 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 432 {
 433   lex_error (lexer, _("missing required subcommand %s"), sbc);
 434 }
 435
 436 /* Prints a syntax error message containing the current token and
 437    given message MESSAGE (if non-null). */
 438 void
 439 lex_error (struct lexer *lexer, const char *message, ...)
 440 {
 441   char *token_rep;
 442   char where[128];
 443
 444   token_rep = lex_token_representation (lexer);
 445   if (lexer->token == T_STOP)
 446     strcpy (where, "end of file");
 447   else if (lexer->token == '.')
 448     strcpy (where, "end of command");
 449   else
 450     snprintf (where, sizeof where, "`%s'", token_rep);
 451   free (token_rep);
 452
 453   if (message)
 454     {
 455       char buf[1024];
 456       va_list args;
 457
 458       va_start (args, message);
 459       vsnprintf (buf, 1024, message, args);
 460       va_end (args);
 461
 462       msg (SE, _("Syntax error %s at %s."), buf, where);
 463     }
 464   else
 465     msg (SE, _("Syntax error at %s."), where);
 466 }
 467
 468 /* Checks that we're at end of command.
 469    If so, returns a successful command completion code.
 470    If not, flags a syntax error and returns an error command
 471    completion code. */
 472 int
 473 lex_end_of_command (struct lexer *lexer)
 474 {
 475   if (lexer->token != '.')
 476     {
 477       lex_error (lexer, _("expecting end of command"));
 478       return CMD_FAILURE;
 479     }
 480   else
 481     return CMD_SUCCESS;
 482 }
 483 \f
 484 /* Token testing functions. */
 485
 486 /* Returns true if the current token is a number. */
 487 bool
 488 lex_is_number (struct lexer *lexer)
 489 {
 490   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 491 }
 492
 493
 494 /* Returns true if the current token is a string. */
 495 bool
 496 lex_is_string (struct lexer *lexer)
 497 {
 498   return lexer->token == T_STRING;
 499 }
 500
 501
 502 /* Returns the value of the current token, which must be a
 503    floating point number. */
 504 double
 505 lex_number (struct lexer *lexer)
 506 {
 507   assert (lex_is_number (lexer));
 508   return lexer->tokval;
 509 }
 510
 511 /* Returns true iff the current token is an integer. */
 512 bool
 513 lex_is_integer (struct lexer *lexer)
 514 {
 515   return (lex_is_number (lexer)
 516           && lexer->tokval > LONG_MIN
 517           && lexer->tokval <= LONG_MAX
 518           && floor (lexer->tokval) == lexer->tokval);
 519 }
 520
 521 /* Returns the value of the current token, which must be an
 522    integer. */
 523 long
 524 lex_integer (struct lexer *lexer)
 525 {
 526   assert (lex_is_integer (lexer));
 527   return lexer->tokval;
 528 }
 529 \f
 530 /* Token matching functions. */
 531
 532 /* If TOK is the current token, skips it and returns true
 533    Otherwise, returns false. */
 534 bool
 535 lex_match (struct lexer *lexer, int t)
 536 {
 537   if (lexer->token == t)
 538     {
 539       lex_get (lexer);
 540       return true;
 541     }
 542   else
 543     return false;
 544 }
 545
 546 /* If the current token is the identifier S, skips it and returns
 547    true.  The identifier may be abbreviated to its first three
 548    letters.
 549    Otherwise, returns false. */
 550 bool
 551 lex_match_id (struct lexer *lexer, const char *s)
 552 {
 553   return lex_match_id_n (lexer, s, 3);
 554 }
 555
 556 /* If the current token is the identifier S, skips it and returns
 557    true.  The identifier may be abbreviated to its first N
 558    letters.
 559    Otherwise, returns false. */
 560 bool
 561 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 562 {
 563   if (lexer->token == T_ID
 564       && lex_id_match_n (ss_cstr (s), ss_cstr (lexer->tokid), n))
 565     {
 566       lex_get (lexer);
 567       return true;
 568     }
 569   else
 570     return false;
 571 }
 572
 573 /* If the current token is integer N, skips it and returns true.
 574    Otherwise, returns false. */
 575 bool
 576 lex_match_int (struct lexer *lexer, int x)
 577 {
 578   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 579     {
 580       lex_get (lexer);
 581       return true;
 582     }
 583   else
 584     return false;
 585 }
 586 \f
 587 /* Forced matches. */
 588
 589 /* If this token is identifier S, fetches the next token and returns
 590    nonzero.
 591    Otherwise, reports an error and returns zero. */
 592 bool
 593 lex_force_match_id (struct lexer *lexer, const char *s)
 594 {
 595   if (lex_match_id (lexer, s))
 596     return true;
 597   else
 598     {
 599       lex_error (lexer, _("expecting `%s'"), s);
 600       return false;
 601     }
 602 }
 603
 604 /* If the current token is T, skips the token.  Otherwise, reports an
 605    error and returns from the current function with return value false. */
 606 bool
 607 lex_force_match (struct lexer *lexer, int t)
 608 {
 609   if (lexer->token == t)
 610     {
 611       lex_get (lexer);
 612       return true;
 613     }
 614   else
 615     {
 616       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 617       return false;
 618     }
 619 }
 620
 621 /* If this token is a string, does nothing and returns true.
 622    Otherwise, reports an error and returns false. */
 623 bool
 624 lex_force_string (struct lexer *lexer)
 625 {
 626   if (lexer->token == T_STRING)
 627     return true;
 628   else
 629     {
 630       lex_error (lexer, _("expecting string"));
 631       return false;
 632     }
 633 }
 634
 635 /* If this token is an integer, does nothing and returns true.
 636    Otherwise, reports an error and returns false. */
 637 bool
 638 lex_force_int (struct lexer *lexer)
 639 {
 640   if (lex_is_integer (lexer))
 641     return true;
 642   else
 643     {
 644       lex_error (lexer, _("expecting integer"));
 645       return false;
 646     }
 647 }
 648
 649 /* If this token is a number, does nothing and returns true.
 650    Otherwise, reports an error and returns false. */
 651 bool
 652 lex_force_num (struct lexer *lexer)
 653 {
 654   if (lex_is_number (lexer))
 655     return true;
 656
 657   lex_error (lexer, _("expecting number"));
 658   return false;
 659 }
 660
 661 /* If this token is an identifier, does nothing and returns true.
 662    Otherwise, reports an error and returns false. */
 663 bool
 664 lex_force_id (struct lexer *lexer)
 665 {
 666   if (lexer->token == T_ID)
 667     return true;
 668
 669   lex_error (lexer, _("expecting identifier"));
 670   return false;
 671 }
 672
 673 /* Weird token functions. */
 674
 675 /* Returns the first character of the next token, except that if the
 676    next token is not an identifier, the character returned will not be
 677    a character that can begin an identifier.  Specifically, the
 678    hexstring lead-in X' causes lookahead() to return '.  Note that an
 679    alphanumeric return value doesn't guarantee an ID token, it could
 680    also be a reserved-word token. */
 681 int
 682 lex_look_ahead (struct lexer *lexer)
 683 {
 684   if (lexer->put_token)
 685     return lexer->put_token;
 686
 687   for (;;)
 688     {
 689       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 690         return 0;
 691
 692       for (;;)
 693         {
 694           while (isspace ((unsigned char) *lexer->prog))
 695             lexer->prog++;
 696           if (*lexer->prog)
 697             break;
 698
 699           if (lexer->dot)
 700             return '.';
 701           else if (!lex_get_line (lexer))
 702             return 0;
 703
 704           if (lexer->put_token)
 705             return lexer->put_token;
 706         }
 707
 708       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 709            || toupper ((unsigned char) *lexer->prog) == 'B'
 710            || toupper ((unsigned char) *lexer->prog) == 'O')
 711           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 712         return '\'';
 713
 714       return *lexer->prog;
 715     }
 716 }
 717
 718 /* Makes the current token become the next token to be read; the
 719    current token is set to T. */
 720 void
 721 lex_put_back (struct lexer *lexer, int t)
 722 {
 723   save_token (lexer);
 724   lexer->token = t;
 725 }
 726
 727 /* Makes the current token become the next token to be read; the
 728    current token is set to the identifier ID. */
 729 void
 730 lex_put_back_id (struct lexer *lexer, const char *id)
 731 {
 732   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 733   save_token (lexer);
 734   lexer->token = T_ID;
 735   ds_assign_cstr (&lexer->tokstr, id);
 736   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 737 }
 738 \f
 739 /* Weird line processing functions. */
 740
 741 /* Returns the entire contents of the current line. */
 742 const char *
 743 lex_entire_line (const struct lexer *lexer)
 744 {
 745   return ds_cstr (&lexer->line_buffer);
 746 }
 747
 748 const struct string *
 749 lex_entire_line_ds (const struct lexer *lexer)
 750 {
 751   return &lexer->line_buffer;
 752 }
 753
 754 /* As lex_entire_line(), but only returns the part of the current line
 755    that hasn't already been tokenized. */
 756 const char *
 757 lex_rest_of_line (const struct lexer *lexer)
 758 {
 759   return lexer->prog;
 760 }
 761
 762 /* Returns true if the current line ends in a terminal dot,
 763    false otherwise. */
 764 bool
 765 lex_end_dot (const struct lexer *lexer)
 766 {
 767   return lexer->dot;
 768 }
 769
 770 /* Causes the rest of the current input line to be ignored for
 771    tokenization purposes. */
 772 void
 773 lex_discard_line (struct lexer *lexer)
 774 {
 775   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 776   lexer->prog = ds_end (&lexer->line_buffer);
 777   lexer->dot = false;
 778   lexer->put_token = 0;
 779 }
 780
 781
 782 /* Discards the rest of the current command.
 783    When we're reading commands from a file, we skip tokens until
 784    a terminal dot or EOF.
 785    When we're reading commands interactively from the user,
 786    that's just discarding the current line, because presumably
 787    the user doesn't want to finish typing a command that will be
 788    ignored anyway. */
 789 void
 790 lex_discard_rest_of_command (struct lexer *lexer)
 791 {
 792   if (!getl_is_interactive (lexer->ss))
 793     {
 794       while (lexer->token != T_STOP && lexer->token != '.')
 795         lex_get (lexer);
 796     }
 797   else
 798     lex_discard_line (lexer);
 799 }
 800 \f
 801 /* Weird line reading functions. */
 802
 803 /* Remove C-style comments in STRING, begun by slash-star and
 804    terminated by star-slash or newline. */
 805 static void
 806 strip_comments (struct string *string)
 807 {
 808   char *cp;
 809   int quote;
 810   bool in_comment;
 811
 812   in_comment = false;
 813   quote = EOF;
 814   for (cp = ds_cstr (string); *cp; )
 815     {
 816       /* If we're not in a comment, check for quote marks. */
 817       if (!in_comment)
 818         {
 819           if (*cp == quote)
 820             quote = EOF;
 821           else if (*cp == '\'' || *cp == '"')
 822             quote = *cp;
 823         }
 824
 825       /* If we're not inside a quotation, check for comment. */
 826       if (quote == EOF)
 827         {
 828           if (cp[0] == '/' && cp[1] == '*')
 829             {
 830               in_comment = true;
 831               *cp++ = ' ';
 832               *cp++ = ' ';
 833               continue;
 834             }
 835           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 836             {
 837               in_comment = false;
 838               *cp++ = ' ';
 839               *cp++ = ' ';
 840               continue;
 841             }
 842         }
 843
 844       /* Check commenting. */
 845       if (in_comment)
 846         *cp = ' ';
 847       cp++;
 848     }
 849 }
 850
 851 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 852    tokenization by stripping comments and determining whether it
 853    is the beginning or end of a command and storing into
 854    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 855 void
 856 lex_preprocess_line (struct string *line,
 857                      enum syntax_mode syntax,
 858                      bool *line_starts_command,
 859                      bool *line_ends_command)
 860 {
 861   strip_comments (line);
 862   ds_rtrim (line, ss_cstr (CC_SPACES));
 863   *line_ends_command = (ds_chomp (line, settings_get_endcmd ())
 864                         || (ds_is_empty (line) && settings_get_nulline ()));
 865   *line_starts_command = false;
 866   if (syntax == GETL_BATCH)
 867     {
 868       int first = ds_first (line);
 869       *line_starts_command = !isspace (first);
 870       if (first == '+' || first == '-')
 871         *ds_data (line) = ' ';
 872     }
 873 }
 874
 875 /* Reads a line, without performing any preprocessing.
 876    Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax
 877    mode. */
 878 bool
 879 lex_get_line_raw (struct lexer *lexer)
 880 {
 881   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 882   enum syntax_mode mode = lex_current_syntax_mode (lexer);
 883   journal_write (mode == GETL_BATCH, ds_cstr (&lexer->line_buffer));
 884
 885   return ok;
 886 }
 887
 888 /* Reads a line for use by the tokenizer, and preprocesses it by
 889    removing comments, stripping trailing whitespace and the
 890    terminal dot, and removing leading indentors. */
 891 bool
 892 lex_get_line (struct lexer *lexer)
 893 {
 894   bool line_starts_command;
 895
 896   if (!lex_get_line_raw (lexer))
 897     {
 898       lexer->prog = NULL;
 899       return false;
 900     }
 901
 902   lex_preprocess_line (&lexer->line_buffer,
 903                        lex_current_syntax_mode (lexer),
 904                        &line_starts_command, &lexer->dot);
 905
 906   if (line_starts_command)
 907     lexer->put_token = '.';
 908
 909   lexer->prog = ds_cstr (&lexer->line_buffer);
 910   return true;
 911 }
 912 \f
 913 /* Token names. */
 914
 915 /* Returns the name of a token. */
 916 const char *
 917 lex_token_name (int token)
 918 {
 919   if (lex_is_keyword (token))
 920     return lex_id_name (token);
 921   else if (token < 256)
 922     {
 923       static char t[256][2];
 924       char *s = t[token];
 925       s[0] = token;
 926       s[1] = '\0';
 927       return s;
 928     }
 929   else
 930     NOT_REACHED ();
 931 }
 932
 933 /* Returns an ASCII representation of the current token as a
 934    malloc()'d string. */
 935 char *
 936 lex_token_representation (struct lexer *lexer)
 937 {
 938   char *token_rep;
 939
 940   switch (lexer->token)
 941     {
 942     case T_ID:
 943     case T_POS_NUM:
 944     case T_NEG_NUM:
 945       return ds_xstrdup (&lexer->tokstr);
 946       break;
 947
 948     case T_STRING:
 949       {
 950         int hexstring = 0;
 951         char *sp, *dp;
 952
 953         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 954           if (!isprint ((unsigned char) *sp))
 955             {
 956               hexstring = 1;
 957               break;
 958             }
 959
 960         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 961
 962         dp = token_rep;
 963         if (hexstring)
 964           *dp++ = 'X';
 965         *dp++ = '\'';
 966
 967         if (!hexstring)
 968           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 969             {
 970               if (*sp == '\'')
 971                 *dp++ = '\'';
 972               *dp++ = (unsigned char) *sp++;
 973             }
 974         else
 975           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 976             {
 977               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 978               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 979             }
 980         *dp++ = '\'';
 981         *dp = '\0';
 982
 983         return token_rep;
 984       }
 985     break;
 986
 987     case T_STOP:
 988       token_rep = xmalloc (1);
 989       *token_rep = '\0';
 990       return token_rep;
 991
 992     case T_EXP:
 993       return xstrdup ("**");
 994
 995     default:
 996       return xstrdup (lex_token_name (lexer->token));
 997     }
 998
 999   NOT_REACHED ();
1000 }
1001 \f
1002 /* Really weird functions. */
1003
1004 /* Most of the time, a `-' is a lead-in to a negative number.  But
1005    sometimes it's actually part of the syntax.  If a dash can be part
1006    of syntax then this function is called to rip it off of a
1007    number. */
1008 void
1009 lex_negative_to_dash (struct lexer *lexer)
1010 {
1011   if (lexer->token == T_NEG_NUM)
1012     {
1013       lexer->token = T_POS_NUM;
1014       lexer->tokval = -lexer->tokval;
1015       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
1016       save_token (lexer);
1017       lexer->token = '-';
1018     }
1019 }
1020
1021 /* Skip a COMMENT command. */
1022 void
1023 lex_skip_comment (struct lexer *lexer)
1024 {
1025   for (;;)
1026     {
1027       if (!lex_get_line (lexer))
1028         {
1029           lexer->put_token = T_STOP;
1030           lexer->prog = NULL;
1031           return;
1032         }
1033
1034       if (lexer->put_token == '.')
1035         break;
1036
1037       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1038       lexer->prog = ds_end (&lexer->line_buffer);
1039       if (lexer->dot)
1040         break;
1041     }
1042 }
1043 \f
1044 /* Private functions. */
1045
1046 /* When invoked, tokstr contains a string of binary, octal, or
1047    hex digits, according to TYPE.  The string is converted to
1048    characters having the specified values. */
1049 static void
1050 convert_numeric_string_to_char_string (struct lexer *lexer,
1051                                        enum string_type type)
1052 {
1053   const char *base_name;
1054   int base;
1055   int chars_per_byte;
1056   size_t byte_cnt;
1057   size_t i;
1058   char *p;
1059
1060   switch (type)
1061     {
1062     case BINARY_STRING:
1063       base_name = _("binary");
1064       base = 2;
1065       chars_per_byte = 8;
1066       break;
1067     case OCTAL_STRING:
1068       base_name = _("octal");
1069       base = 8;
1070       chars_per_byte = 3;
1071       break;
1072     case HEX_STRING:
1073       base_name = _("hex");
1074       base = 16;
1075       chars_per_byte = 2;
1076       break;
1077     default:
1078       NOT_REACHED ();
1079     }
1080
1081   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1082   if (ds_length (&lexer->tokstr) % chars_per_byte)
1083     msg (SE, _("String of %s digits has %zu characters, which is not a "
1084                "multiple of %d."),
1085          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1086
1087   p = ds_cstr (&lexer->tokstr);
1088   for (i = 0; i < byte_cnt; i++)
1089     {
1090       int value;
1091       int j;
1092
1093       value = 0;
1094       for (j = 0; j < chars_per_byte; j++, p++)
1095         {
1096           int v;
1097
1098           if (*p >= '0' && *p <= '9')
1099             v = *p - '0';
1100           else
1101             {
1102               static const char alpha[] = "abcdef";
1103               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1104
1105               if (q)
1106                 v = q - alpha + 10;
1107               else
1108                 v = base;
1109             }
1110
1111           if (v >= base)
1112             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1113
1114           value = value * base + v;
1115         }
1116
1117       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1118     }
1119
1120   ds_truncate (&lexer->tokstr, byte_cnt);
1121 }
1122
1123 /* Parses a string from the input buffer into tokstr.  The input
1124    buffer pointer lexer->prog must point to the initial single or double
1125    quote.  TYPE indicates the type of string to be parsed.
1126    Returns token type. */
1127 static int
1128 parse_string (struct lexer *lexer, enum string_type type)
1129 {
1130   if (type != CHARACTER_STRING)
1131     lexer->prog++;
1132
1133   /* Accumulate the entire string, joining sections indicated by +
1134      signs. */
1135   for (;;)
1136     {
1137       /* Single or double quote. */
1138       int c = *lexer->prog++;
1139
1140       /* Accumulate section. */
1141       for (;;)
1142         {
1143           /* Check end of line. */
1144           if (*lexer->prog == '\0')
1145             {
1146               msg (SE, _("Unterminated string constant."));
1147               goto finish;
1148             }
1149
1150           /* Double quote characters to embed them in strings. */
1151           if (*lexer->prog == c)
1152             {
1153               if (lexer->prog[1] == c)
1154                 lexer->prog++;
1155               else
1156                 break;
1157             }
1158
1159           ds_put_char (&lexer->tokstr, *lexer->prog++);
1160         }
1161       lexer->prog++;
1162
1163       /* Skip whitespace after final quote mark. */
1164       if (lexer->prog == NULL)
1165         break;
1166       for (;;)
1167         {
1168           while (isspace ((unsigned char) *lexer->prog))
1169             lexer->prog++;
1170           if (*lexer->prog)
1171             break;
1172
1173           if (lexer->dot)
1174             goto finish;
1175
1176           if (!lex_get_line (lexer))
1177             goto finish;
1178         }
1179
1180       /* Skip plus sign. */
1181       if (*lexer->prog != '+')
1182         break;
1183       lexer->prog++;
1184
1185       /* Skip whitespace after plus sign. */
1186       if (lexer->prog == NULL)
1187         break;
1188       for (;;)
1189         {
1190           while (isspace ((unsigned char) *lexer->prog))
1191             lexer->prog++;
1192           if (*lexer->prog)
1193             break;
1194
1195           if (lexer->dot)
1196             goto finish;
1197
1198           if (!lex_get_line (lexer))
1199             {
1200               msg (SE, _("Unexpected end of file in string concatenation."));
1201               goto finish;
1202             }
1203         }
1204
1205       /* Ensure that a valid string follows. */
1206       if (*lexer->prog != '\'' && *lexer->prog != '"')
1207         {
1208           msg (SE, _("String expected following `+'."));
1209           goto finish;
1210         }
1211     }
1212
1213   /* We come here when we've finished concatenating all the string sections
1214      into one large string. */
1215 finish:
1216   if (type != CHARACTER_STRING)
1217     convert_numeric_string_to_char_string (lexer, type);
1218
1219   if (ds_length (&lexer->tokstr) > 255)
1220     {
1221       msg (SE, _("String exceeds 255 characters in length (%zu characters)."),
1222            ds_length (&lexer->tokstr));
1223       ds_truncate (&lexer->tokstr, 255);
1224     }
1225
1226   return T_STRING;
1227 }
1228 \f
1229 #if DUMP_TOKENS
1230 /* Reads one token from the lexer and writes a textual representation
1231    on stdout for debugging purposes. */
1232 static void
1233 dump_token (struct lexer *lexer)
1234 {
1235   {
1236     const char *curfn;
1237     int curln;
1238
1239     curln = getl_source_location (lexer->ss);
1240     curfn = getl_source_name (lexer->ss);
1241     if (curfn)
1242       fprintf (stderr, "%s:%d\t", curfn, curln);
1243   }
1244
1245   switch (lexer->token)
1246     {
1247     case T_ID:
1248       fprintf (stderr, "ID\t%s\n", lexer->tokid);
1249       break;
1250
1251     case T_POS_NUM:
1252     case T_NEG_NUM:
1253       fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1254       break;
1255
1256     case T_STRING:
1257       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1258       break;
1259
1260     case T_STOP:
1261       fprintf (stderr, "STOP\n");
1262       break;
1263
1264     case T_EXP:
1265       fprintf (stderr, "MISC\tEXP\"");
1266       break;
1267
1268     case 0:
1269       fprintf (stderr, "MISC\tEOF\n");
1270       break;
1271
1272     default:
1273       if (lex_is_keyword (lexer->token))
1274         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (lexer->token));
1275       else
1276         fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1277       break;
1278     }
1279 }
1280 #endif /* DUMP_TOKENS */
1281
1282
1283 /* Token Accessor Functions */
1284
1285 int
1286 lex_token (const struct lexer *lexer)
1287 {
1288   return lexer->token;
1289 }
1290
1291 double
1292 lex_tokval (const struct lexer *lexer)
1293 {
1294   return lexer->tokval;
1295 }
1296
1297 const char *
1298 lex_tokid (const struct lexer *lexer)
1299 {
1300   return lexer->tokid;
1301 }
1302
1303 const struct string *
1304 lex_tokstr (const struct lexer *lexer)
1305 {
1306   return &lexer->tokstr;
1307 }
1308
1309 /* If the lexer is positioned at the (pseudo)identifier S, which
1310    may contain a hyphen ('-'), skips it and returns true.  Each
1311    half of the identifier may be abbreviated to its first three
1312    letters.
1313    Otherwise, returns false. */
1314 bool
1315 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1316 {
1317   const char *hyphen = strchr (s, '-');
1318   if (hyphen == NULL)
1319     return lex_match_id (lexer, s);
1320   else if (lexer->token != T_ID
1321            || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid))
1322            || lex_look_ahead (lexer) != '-')
1323     return false;
1324   else
1325     {
1326       lex_get (lexer);
1327       lex_force_match (lexer, '-');
1328       lex_force_match_id (lexer, hyphen + 1);
1329       return true;
1330     }
1331 }
1332