src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "lexer.h"
  20 #include <c-ctype.h>
  21 #include <c-strtod.h>
  22 #include <errno.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <language/command.h>
  29 #include <data/settings.h>
  30 #include <libpspp/assertion.h>
  31 #include <libpspp/getl.h>
  32 #include <libpspp/message.h>
  33 #include <libpspp/str.h>
  34 #include <output/journal.h>
  35 #include <output/text-item.h>
  36
  37 #include "xalloc.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43 struct lexer
  44 {
  45   struct string line_buffer;
  46
  47   struct source_stream *ss;
  48
  49   int token;      /* Current token. */
  50   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  51
  52   char tokid [VAR_NAME_LEN + 1];   /* T_ID: the identifier. */
  53
  54   struct string tokstr;   /* T_ID, T_STRING: token string value.
  55                             For T_ID, this is not truncated as is
  56                             tokid. */
  57
  58   char *prog; /* Pointer to next token in line_buffer. */
  59   bool dot;   /* True only if this line ends with a terminal dot. */
  60
  61   int put_token ; /* If nonzero, next token returned by lex_get().
  62                     Used only in exceptional circumstances. */
  63
  64   struct string put_tokstr;
  65   double put_tokval;
  66 };
  67
  68
  69 static int parse_id (struct lexer *);
  70
  71 /* How a string represents its contents. */
  72 enum string_type
  73   {
  74     CHARACTER_STRING,   /* Characters. */
  75     BINARY_STRING,      /* Binary digits. */
  76     OCTAL_STRING,       /* Octal digits. */
  77     HEX_STRING          /* Hexadecimal digits. */
  78   };
  79
  80 static int parse_string (struct lexer *, enum string_type);
  81 \f
  82 /* Initialization. */
  83
  84 /* Initializes the lexer. */
  85 struct lexer *
  86 lex_create (struct source_stream *ss)
  87 {
  88   struct lexer *lexer = xzalloc (sizeof (*lexer));
  89
  90   ds_init_empty (&lexer->tokstr);
  91   ds_init_empty (&lexer->put_tokstr);
  92   ds_init_empty (&lexer->line_buffer);
  93   lexer->ss = ss;
  94
  95   return lexer;
  96 }
  97
  98 struct source_stream *
  99 lex_get_source_stream (const struct lexer *lex)
 100 {
 101   return lex->ss;
 102 }
 103
 104 enum syntax_mode
 105 lex_current_syntax_mode (const struct lexer *lex)
 106 {
 107   return source_stream_current_syntax_mode (lex->ss);
 108 }
 109
 110 enum error_mode
 111 lex_current_error_mode (const struct lexer *lex)
 112 {
 113   return source_stream_current_error_mode (lex->ss);
 114 }
 115
 116
 117 void
 118 lex_destroy (struct lexer *lexer)
 119 {
 120   if ( NULL != lexer )
 121     {
 122       ds_destroy (&lexer->put_tokstr);
 123       ds_destroy (&lexer->tokstr);
 124       ds_destroy (&lexer->line_buffer);
 125
 126       free (lexer);
 127     }
 128 }
 129
 130 \f
 131 /* Common functions. */
 132
 133 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 134    tokval, respectively, and sets tokid appropriately. */
 135 static void
 136 restore_token (struct lexer *lexer)
 137 {
 138   assert (lexer->put_token != 0);
 139   lexer->token = lexer->put_token;
 140   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 141   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 142   lexer->tokval = lexer->put_tokval;
 143   lexer->put_token = 0;
 144 }
 145
 146 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 147    put_lexer->tokval respectively. */
 148 static void
 149 save_token (struct lexer *lexer)
 150 {
 151   lexer->put_token = lexer->token;
 152   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 153   lexer->put_tokval = lexer->tokval;
 154 }
 155
/* Parses a single token from LEXER's input and makes it the
   current token: its type is stored in LEXER->token and its
   attributes in tokstr, tokid, and tokval as appropriate.

   Once a line is available, a token that was pushed back
   (put_token) is returned in preference to tokenizing new input.
   The token is T_STOP when no further line can be read, and '.'
   when the end of a line that had a terminal dot is reached. */
void
lex_get (struct lexer *lexer)
{
  /* Find a token. */
  for (;;)
    {
      /* No line has been read yet (or input ran out earlier): try
         to fetch one. */
      if (NULL == lexer->prog && ! lex_get_line (lexer) )
        {
          lexer->token = T_STOP;
          return;
        }

      /* If a token was pushed ahead, return it. */
      if (lexer->put_token)
        {
          restore_token (lexer);
          return;
        }

      for (;;)
        {
          /* Skip whitespace. */
          while (c_isspace ((unsigned char) *lexer->prog))
            lexer->prog++;

          /* Something other than whitespace remains on this line. */
          if (*lexer->prog)
            break;

          /* The line is exhausted.  Report its terminal dot, if it
             had one, exactly once; otherwise move on to the next
             line. */
          if (lexer->dot)
            {
              lexer->dot = 0;
              lexer->token = '.';
              return;
            }
          else if (!lex_get_line (lexer))
            {
              lexer->prog = NULL;
              lexer->token = T_STOP;
              return;
            }

          /* lex_get_line() may itself have pushed back a token. */
          if (lexer->put_token)
            {
              restore_token (lexer);
              return;
            }
        }


      /* Actually parse the token. */
      ds_clear (&lexer->tokstr);

      switch (*lexer->prog)
        {
        case '-': case '.':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          {
            char *tail;

            /* `-' can introduce a negative number, or it can be a
               token by itself.  If it is not followed by a digit or a
               decimal point, it is definitely not a number.
               Otherwise, it might be either, but most of the time we
               want it as a number.  When the syntax calls for a `-'
               token, lex_negative_to_dash() must be used to break
               negative numbers into two tokens. */
            if (*lexer->prog == '-')
              {
                /* Whitespace between the sign and the digits is
                   skipped and not copied into tokstr. */
                ds_put_byte (&lexer->tokstr, *lexer->prog++);
                while (c_isspace ((unsigned char) *lexer->prog))
                  lexer->prog++;

                if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
                  {
                    lexer->token = '-';
                    break;
                  }
                lexer->token = T_NEG_NUM;
              }
            else
              lexer->token = T_POS_NUM;

            /* Parse the number, copying it into tokstr. */
            while (c_isdigit ((unsigned char) *lexer->prog))
              ds_put_byte (&lexer->tokstr, *lexer->prog++);
            if (*lexer->prog == '.')
              {
                ds_put_byte (&lexer->tokstr, *lexer->prog++);
                while (c_isdigit ((unsigned char) *lexer->prog))
                  ds_put_byte (&lexer->tokstr, *lexer->prog++);
              }
            if (*lexer->prog == 'e' || *lexer->prog == 'E')
              {
                ds_put_byte (&lexer->tokstr, *lexer->prog++);
                if (*lexer->prog == '+' || *lexer->prog == '-')
                  ds_put_byte (&lexer->tokstr, *lexer->prog++);
                while (c_isdigit ((unsigned char) *lexer->prog))
                  ds_put_byte (&lexer->tokstr, *lexer->prog++);
              }

            /* Parse as floating point. */
            lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail);
            if (*tail)
              {
                /* Part of the collected text was not consumed by the
                   conversion: report it and substitute the number
                   zero, keeping the token type already chosen. */
                msg (SE, _("%s does not form a valid number."),
                     ds_cstr (&lexer->tokstr));
                lexer->tokval = 0.0;

                ds_clear (&lexer->tokstr);
                ds_put_byte (&lexer->tokstr, '0');
              }

            break;
          }

        case '\'': case '"':
          lexer->token = parse_string (lexer, CHARACTER_STRING);
          break;

        /* Single-character tokens whose token value is the
           character itself. */
        case '(': case ')': case ',': case '=': case '+': case '/':
        case '[': case ']':
          lexer->token = *lexer->prog++;
          break;

        /* `*' or `**'. */
        case '*':
          if (*++lexer->prog == '*')
            {
              lexer->prog++;
              lexer->token = T_EXP;
            }
          else
            lexer->token = '*';
          break;

        /* `<', `<=', or `<>'. */
        case '<':
          if (*++lexer->prog == '=')
            {
              lexer->prog++;
              lexer->token = T_LE;
            }
          else if (*lexer->prog == '>')
            {
              lexer->prog++;
              lexer->token = T_NE;
            }
          else
            lexer->token = T_LT;
          break;

        /* `>' or `>='. */
        case '>':
          if (*++lexer->prog == '=')
            {
              lexer->prog++;
              lexer->token = T_GE;
            }
          else
            lexer->token = T_GT;
          break;

        /* `~' or `~='. */
        case '~':
          if (*++lexer->prog == '=')
            {
              lexer->prog++;
              lexer->token = T_NE;
            }
          else
            lexer->token = T_NOT;
          break;

        case '&':
          lexer->prog++;
          lexer->token = T_AND;
          break;

        case '|':
          lexer->prog++;
          lexer->token = T_OR;
          break;

        /* B, O, or X immediately followed by a quote introduces a
           binary, octal, or hex string, respectively; otherwise the
           letter starts an identifier. */
        case 'b': case 'B':
          if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
            lexer->token = parse_string (lexer, BINARY_STRING);
          else
            lexer->token = parse_id (lexer);
          break;

        case 'o': case 'O':
          if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
            lexer->token = parse_string (lexer, OCTAL_STRING);
          else
            lexer->token = parse_id (lexer);
          break;

        case 'x': case 'X':
          if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
            lexer->token = parse_string (lexer, HEX_STRING);
          else
            lexer->token = parse_id (lexer);
          break;

        default:
          if (lex_is_id1 (*lexer->prog))
            {
              lexer->token = parse_id (lexer);
              break;
            }
          else
            {
              /* Report the offending byte (as an octal escape if it
                 is not a graphic character), skip it, and restart
                 the outer loop to look for the next token. */
              unsigned char c = *lexer->prog++;
              char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c);
              msg (SE, _("Bad character in input: `%s'."), c_name);
              free (c_name);
              continue;
            }
        }
      /* A token was parsed, so leave the outer loop. */
      break;
    }
}
 377
 378 /* Parses an identifier at the current position into tokid and
 379    tokstr.
 380    Returns the correct token type. */
 381 static int
 382 parse_id (struct lexer *lexer)
 383 {
 384   struct substring rest_of_line
 385     = ss_substr (ds_ss (&lexer->line_buffer),
 386                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 387                  SIZE_MAX);
 388   struct substring id = ss_head (rest_of_line,
 389                                  lex_id_get_length (rest_of_line));
 390   lexer->prog += ss_length (id);
 391
 392   ds_assign_substring (&lexer->tokstr, id);
 393   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 394   return lex_id_to_token (id);
 395 }
 396
 397 /* Reports an error to the effect that subcommand SBC may only be
 398    specified once. */
 399 void
 400 lex_sbc_only_once (const char *sbc)
 401 {
 402   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 403 }
 404
 405 /* Reports an error to the effect that subcommand SBC is
 406    missing. */
 407 void
 408 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 409 {
 410   lex_error (lexer, _("missing required subcommand %s"), sbc);
 411 }
 412
 413 /* Prints a syntax error message containing the current token and
 414    given message MESSAGE (if non-null). */
 415 void
 416 lex_error (struct lexer *lexer, const char *message, ...)
 417 {
 418   struct string s;
 419
 420   ds_init_empty (&s);
 421
 422   if (lexer->token == T_STOP)
 423     ds_put_cstr (&s, _("Syntax error at end of file"));
 424   else if (lexer->token == '.')
 425     ds_put_cstr (&s, _("Syntax error at end of command"));
 426   else
 427     {
 428       char *token_rep = lex_token_representation (lexer);
 429       ds_put_format (&s, _("Syntax error at `%s'"), token_rep);
 430       free (token_rep);
 431     }
 432
 433   if (message)
 434     {
 435       va_list args;
 436
 437       ds_put_cstr (&s, ": ");
 438
 439       va_start (args, message);
 440       ds_put_vformat (&s, message, args);
 441       va_end (args);
 442     }
 443
 444   msg (SE, "%s.", ds_cstr (&s));
 445   ds_destroy (&s);
 446 }
 447
 448 /* Checks that we're at end of command.
 449    If so, returns a successful command completion code.
 450    If not, flags a syntax error and returns an error command
 451    completion code. */
 452 int
 453 lex_end_of_command (struct lexer *lexer)
 454 {
 455   if (lexer->token != '.')
 456     {
 457       lex_error (lexer, _("expecting end of command"));
 458       return CMD_FAILURE;
 459     }
 460   else
 461     return CMD_SUCCESS;
 462 }
 463 \f
 464 /* Token testing functions. */
 465
 466 /* Returns true if the current token is a number. */
 467 bool
 468 lex_is_number (struct lexer *lexer)
 469 {
 470   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 471 }
 472
 473
 474 /* Returns true if the current token is a string. */
 475 bool
 476 lex_is_string (struct lexer *lexer)
 477 {
 478   return lexer->token == T_STRING;
 479 }
 480
 481
 482 /* Returns the value of the current token, which must be a
 483    floating point number. */
 484 double
 485 lex_number (struct lexer *lexer)
 486 {
 487   assert (lex_is_number (lexer));
 488   return lexer->tokval;
 489 }
 490
 491 /* Returns true iff the current token is an integer. */
 492 bool
 493 lex_is_integer (struct lexer *lexer)
 494 {
 495   return (lex_is_number (lexer)
 496           && lexer->tokval > LONG_MIN
 497           && lexer->tokval <= LONG_MAX
 498           && floor (lexer->tokval) == lexer->tokval);
 499 }
 500
 501 /* Returns the value of the current token, which must be an
 502    integer. */
 503 long
 504 lex_integer (struct lexer *lexer)
 505 {
 506   assert (lex_is_integer (lexer));
 507   return lexer->tokval;
 508 }
 509 \f
 510 /* Token matching functions. */
 511
 512 /* If TOK is the current token, skips it and returns true
 513    Otherwise, returns false. */
 514 bool
 515 lex_match (struct lexer *lexer, int t)
 516 {
 517   if (lexer->token == t)
 518     {
 519       lex_get (lexer);
 520       return true;
 521     }
 522   else
 523     return false;
 524 }
 525
 526 /* If the current token is the identifier S, skips it and returns
 527    true.  The identifier may be abbreviated to its first three
 528    letters.
 529    Otherwise, returns false. */
 530 bool
 531 lex_match_id (struct lexer *lexer, const char *s)
 532 {
 533   return lex_match_id_n (lexer, s, 3);
 534 }
 535
 536 /* If the current token is the identifier S, skips it and returns
 537    true.  The identifier may be abbreviated to its first N
 538    letters.
 539    Otherwise, returns false. */
 540 bool
 541 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 542 {
 543   if (lexer->token == T_ID
 544       && lex_id_match_n (ss_cstr (s), ss_cstr (lexer->tokid), n))
 545     {
 546       lex_get (lexer);
 547       return true;
 548     }
 549   else
 550     return false;
 551 }
 552
 553 /* If the current token is integer N, skips it and returns true.
 554    Otherwise, returns false. */
 555 bool
 556 lex_match_int (struct lexer *lexer, int x)
 557 {
 558   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 559     {
 560       lex_get (lexer);
 561       return true;
 562     }
 563   else
 564     return false;
 565 }
 566 \f
 567 /* Forced matches. */
 568
 569 /* If this token is identifier S, fetches the next token and returns
 570    nonzero.
 571    Otherwise, reports an error and returns zero. */
 572 bool
 573 lex_force_match_id (struct lexer *lexer, const char *s)
 574 {
 575   if (lex_match_id (lexer, s))
 576     return true;
 577   else
 578     {
 579       lex_error (lexer, _("expecting `%s'"), s);
 580       return false;
 581     }
 582 }
 583
 584 /* If the current token is T, skips the token.  Otherwise, reports an
 585    error and returns from the current function with return value false. */
 586 bool
 587 lex_force_match (struct lexer *lexer, int t)
 588 {
 589   if (lexer->token == t)
 590     {
 591       lex_get (lexer);
 592       return true;
 593     }
 594   else
 595     {
 596       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 597       return false;
 598     }
 599 }
 600
 601 /* If this token is a string, does nothing and returns true.
 602    Otherwise, reports an error and returns false. */
 603 bool
 604 lex_force_string (struct lexer *lexer)
 605 {
 606   if (lex_is_string (lexer))
 607     return true;
 608   else
 609     {
 610       lex_error (lexer, _("expecting string"));
 611       return false;
 612     }
 613 }
 614
 615 /* If this token is an integer, does nothing and returns true.
 616    Otherwise, reports an error and returns false. */
 617 bool
 618 lex_force_int (struct lexer *lexer)
 619 {
 620   if (lex_is_integer (lexer))
 621     return true;
 622   else
 623     {
 624       lex_error (lexer, _("expecting integer"));
 625       return false;
 626     }
 627 }
 628
 629 /* If this token is a number, does nothing and returns true.
 630    Otherwise, reports an error and returns false. */
 631 bool
 632 lex_force_num (struct lexer *lexer)
 633 {
 634   if (lex_is_number (lexer))
 635     return true;
 636
 637   lex_error (lexer, _("expecting number"));
 638   return false;
 639 }
 640
 641 /* If this token is an identifier, does nothing and returns true.
 642    Otherwise, reports an error and returns false. */
 643 bool
 644 lex_force_id (struct lexer *lexer)
 645 {
 646   if (lexer->token == T_ID)
 647     return true;
 648
 649   lex_error (lexer, _("expecting identifier"));
 650   return false;
 651 }
 652
 653 /* Weird token functions. */
 654
 655 /* Returns the first character of the next token, except that if the
 656    next token is not an identifier, the character returned will not be
 657    a character that can begin an identifier.  Specifically, the
 658    hexstring lead-in X' causes lookahead() to return '.  Note that an
 659    alphanumeric return value doesn't guarantee an ID token, it could
 660    also be a reserved-word token. */
 661 int
 662 lex_look_ahead (struct lexer *lexer)
 663 {
 664   if (lexer->put_token)
 665     return lexer->put_token;
 666
 667   for (;;)
 668     {
 669       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 670         return 0;
 671
 672       for (;;)
 673         {
 674           while (c_isspace ((unsigned char) *lexer->prog))
 675             lexer->prog++;
 676           if (*lexer->prog)
 677             break;
 678
 679           if (lexer->dot)
 680             return '.';
 681           else if (!lex_get_line (lexer))
 682             return 0;
 683
 684           if (lexer->put_token)
 685             return lexer->put_token;
 686         }
 687
 688       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 689            || toupper ((unsigned char) *lexer->prog) == 'B'
 690            || toupper ((unsigned char) *lexer->prog) == 'O')
 691           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 692         return '\'';
 693
 694       return *lexer->prog;
 695     }
 696 }
 697
 698 /* Makes the current token become the next token to be read; the
 699    current token is set to T. */
 700 void
 701 lex_put_back (struct lexer *lexer, int t)
 702 {
 703   save_token (lexer);
 704   lexer->token = t;
 705 }
 706
 707 /* Makes the current token become the next token to be read; the
 708    current token is set to the identifier ID. */
 709 void
 710 lex_put_back_id (struct lexer *lexer, const char *id)
 711 {
 712   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 713   save_token (lexer);
 714   lexer->token = T_ID;
 715   ds_assign_cstr (&lexer->tokstr, id);
 716   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 717 }
 718 \f
 719 /* Weird line processing functions. */
 720
 721 /* Returns the entire contents of the current line. */
 722 const char *
 723 lex_entire_line (const struct lexer *lexer)
 724 {
 725   return ds_cstr (&lexer->line_buffer);
 726 }
 727
 728 const struct string *
 729 lex_entire_line_ds (const struct lexer *lexer)
 730 {
 731   return &lexer->line_buffer;
 732 }
 733
 734 /* As lex_entire_line(), but only returns the part of the current line
 735    that hasn't already been tokenized. */
 736 const char *
 737 lex_rest_of_line (const struct lexer *lexer)
 738 {
 739   return lexer->prog;
 740 }
 741
 742 /* Returns true if the current line ends in a terminal dot,
 743    false otherwise. */
 744 bool
 745 lex_end_dot (const struct lexer *lexer)
 746 {
 747   return lexer->dot;
 748 }
 749
 750 /* Causes the rest of the current input line to be ignored for
 751    tokenization purposes. */
 752 void
 753 lex_discard_line (struct lexer *lexer)
 754 {
 755   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 756   lexer->prog = ds_end (&lexer->line_buffer);
 757   lexer->dot = false;
 758   lexer->put_token = 0;
 759 }
 760
 761
 762 /* Discards the rest of the current command.
 763    When we're reading commands from a file, we skip tokens until
 764    a terminal dot or EOF.
 765    When we're reading commands interactively from the user,
 766    that's just discarding the current line, because presumably
 767    the user doesn't want to finish typing a command that will be
 768    ignored anyway. */
 769 void
 770 lex_discard_rest_of_command (struct lexer *lexer)
 771 {
 772   if (!getl_is_interactive (lexer->ss))
 773     {
 774       while (lexer->token != T_STOP && lexer->token != '.')
 775         lex_get (lexer);
 776     }
 777   else
 778     lex_discard_line (lexer);
 779 }
 780 \f
 781 /* Weird line reading functions. */
 782
 783 /* Remove C-style comments in STRING, begun by slash-star and
 784    terminated by star-slash or newline. */
 785 static void
 786 strip_comments (struct string *string)
 787 {
 788   char *cp;
 789   int quote;
 790   bool in_comment;
 791
 792   in_comment = false;
 793   quote = EOF;
 794   for (cp = ds_cstr (string); *cp; )
 795     {
 796       /* If we're not in a comment, check for quote marks. */
 797       if (!in_comment)
 798         {
 799           if (*cp == quote)
 800             quote = EOF;
 801           else if (*cp == '\'' || *cp == '"')
 802             quote = *cp;
 803         }
 804
 805       /* If we're not inside a quotation, check for comment. */
 806       if (quote == EOF)
 807         {
 808           if (cp[0] == '/' && cp[1] == '*')
 809             {
 810               in_comment = true;
 811               *cp++ = ' ';
 812               *cp++ = ' ';
 813               continue;
 814             }
 815           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 816             {
 817               in_comment = false;
 818               *cp++ = ' ';
 819               *cp++ = ' ';
 820               continue;
 821             }
 822         }
 823
 824       /* Check commenting. */
 825       if (in_comment)
 826         *cp = ' ';
 827       cp++;
 828     }
 829 }
 830
 831 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 832    tokenization by stripping comments and determining whether it
 833    is the beginning or end of a command and storing into
 834    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 835 void
 836 lex_preprocess_line (struct string *line,
 837                      enum syntax_mode syntax,
 838                      bool *line_starts_command,
 839                      bool *line_ends_command)
 840 {
 841   strip_comments (line);
 842   ds_rtrim (line, ss_cstr (CC_SPACES));
 843   *line_ends_command = (ds_chomp (line, settings_get_endcmd ())
 844                         || (ds_is_empty (line) && settings_get_nulline ()));
 845   *line_starts_command = false;
 846   if (syntax == GETL_BATCH)
 847     {
 848       int first = ds_first (line);
 849       *line_starts_command = !c_isspace (first);
 850       if (first == '+' || first == '-')
 851         *ds_data (line) = ' ';
 852     }
 853 }
 854
 855 /* Reads a line, without performing any preprocessing. */
 856 bool
 857 lex_get_line_raw (struct lexer *lexer)
 858 {
 859   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 860   if (ok)
 861     {
 862       const char *line = ds_cstr (&lexer->line_buffer);
 863       text_item_submit (text_item_create (TEXT_ITEM_SYNTAX, line));
 864     }
 865   else
 866     lexer->prog = NULL;
 867   return ok;
 868 }
 869
 870 /* Reads a line for use by the tokenizer, and preprocesses it by
 871    removing comments, stripping trailing whitespace and the
 872    terminal dot, and removing leading indentors. */
 873 bool
 874 lex_get_line (struct lexer *lexer)
 875 {
 876   bool line_starts_command;
 877
 878   if (!lex_get_line_raw (lexer))
 879     return false;
 880
 881   lex_preprocess_line (&lexer->line_buffer,
 882                        lex_current_syntax_mode (lexer),
 883                        &line_starts_command, &lexer->dot);
 884
 885   if (line_starts_command)
 886     lexer->put_token = '.';
 887
 888   lexer->prog = ds_cstr (&lexer->line_buffer);
 889   return true;
 890 }
 891 \f
 892 /* Token names. */
 893
 894 /* Returns the name of a token. */
 895 const char *
 896 lex_token_name (int token)
 897 {
 898   if (lex_is_keyword (token))
 899     return lex_id_name (token);
 900   else if (token < 256)
 901     {
 902       static char t[256][2];
 903       char *s = t[token];
 904       s[0] = token;
 905       s[1] = '\0';
 906       return s;
 907     }
 908   else
 909     NOT_REACHED ();
 910 }
 911
 912 /* Returns an ASCII representation of the current token as a
 913    malloc()'d string. */
 914 char *
 915 lex_token_representation (struct lexer *lexer)
 916 {
 917   char *token_rep;
 918
 919   switch (lexer->token)
 920     {
 921     case T_ID:
 922     case T_POS_NUM:
 923     case T_NEG_NUM:
 924       return ds_xstrdup (&lexer->tokstr);
 925       break;
 926
 927     case T_STRING:
 928       {
 929         int hexstring = 0;
 930         char *sp, *dp;
 931
 932         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 933           if (!c_isprint ((unsigned char) *sp))
 934             {
 935               hexstring = 1;
 936               break;
 937             }
 938
 939         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 940
 941         dp = token_rep;
 942         if (hexstring)
 943           *dp++ = 'X';
 944         *dp++ = '\'';
 945
 946         if (!hexstring)
 947           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 948             {
 949               if (*sp == '\'')
 950                 *dp++ = '\'';
 951               *dp++ = (unsigned char) *sp++;
 952             }
 953         else
 954           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 955             {
 956               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 957               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 958             }
 959         *dp++ = '\'';
 960         *dp = '\0';
 961
 962         return token_rep;
 963       }
 964     break;
 965
 966     case T_STOP:
 967       token_rep = xmalloc (1);
 968       *token_rep = '\0';
 969       return token_rep;
 970
 971     case T_EXP:
 972       return xstrdup ("**");
 973
 974     default:
 975       return xstrdup (lex_token_name (lexer->token));
 976     }
 977
 978   NOT_REACHED ();
 979 }
 980 \f
 981 /* Really weird functions. */
 982
 983 /* Most of the time, a `-' is a lead-in to a negative number.  But
 984    sometimes it's actually part of the syntax.  If a dash can be part
 985    of syntax then this function is called to rip it off of a
 986    number. */
 987 void
 988 lex_negative_to_dash (struct lexer *lexer)
 989 {
 990   if (lexer->token == T_NEG_NUM)
 991     {
 992       lexer->token = T_POS_NUM;
 993       lexer->tokval = -lexer->tokval;
 994       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 995       save_token (lexer);
 996       lexer->token = '-';
 997     }
 998 }
 999
1000 /* Skip a COMMENT command. */
1001 void
1002 lex_skip_comment (struct lexer *lexer)
1003 {
1004   for (;;)
1005     {
1006       if (!lex_get_line (lexer))
1007         {
1008           lexer->put_token = T_STOP;
1009           lexer->prog = NULL;
1010           return;
1011         }
1012
1013       if (lexer->put_token == '.')
1014         break;
1015
1016       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1017       lexer->prog = ds_end (&lexer->line_buffer);
1018       if (lexer->dot)
1019         break;
1020     }
1021 }
1022 \f
1023 /* Private functions. */
1024
1025 /* When invoked, tokstr contains a string of binary, octal, or
1026    hex digits, according to TYPE.  The string is converted to
1027    characters having the specified values. */
1028 static void
1029 convert_numeric_string_to_char_string (struct lexer *lexer,
1030                                        enum string_type type)
1031 {
1032   const char *base_name;
1033   int base;
1034   int chars_per_byte;
1035   size_t byte_cnt;
1036   size_t i;
1037   char *p;
1038
1039   switch (type)
1040     {
1041     case BINARY_STRING:
1042       base_name = _("binary");
1043       base = 2;
1044       chars_per_byte = 8;
1045       break;
1046     case OCTAL_STRING:
1047       base_name = _("octal");
1048       base = 8;
1049       chars_per_byte = 3;
1050       break;
1051     case HEX_STRING:
1052       base_name = _("hex");
1053       base = 16;
1054       chars_per_byte = 2;
1055       break;
1056     default:
1057       NOT_REACHED ();
1058     }
1059
1060   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1061   if (ds_length (&lexer->tokstr) % chars_per_byte)
1062     msg (SE, _("String of %s digits has %zu characters, which is not a "
1063                "multiple of %d."),
1064          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1065
1066   p = ds_cstr (&lexer->tokstr);
1067   for (i = 0; i < byte_cnt; i++)
1068     {
1069       int value;
1070       int j;
1071
1072       value = 0;
1073       for (j = 0; j < chars_per_byte; j++, p++)
1074         {
1075           int v;
1076
1077           if (*p >= '0' && *p <= '9')
1078             v = *p - '0';
1079           else
1080             {
1081               static const char alpha[] = "abcdef";
1082               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1083
1084               if (q)
1085                 v = q - alpha + 10;
1086               else
1087                 v = base;
1088             }
1089
1090           if (v >= base)
1091             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1092
1093           value = value * base + v;
1094         }
1095
1096       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1097     }
1098
1099   ds_truncate (&lexer->tokstr, byte_cnt);
1100 }
1101
1102 /* Parses a string from the input buffer into tokstr.  The input
1103    buffer pointer lexer->prog must point to the initial single or double
1104    quote.  TYPE indicates the type of string to be parsed.
1105    Returns token type. */
1106 static int
1107 parse_string (struct lexer *lexer, enum string_type type)
1108 {
1109   if (type != CHARACTER_STRING)
1110     lexer->prog++;
1111
1112   /* Accumulate the entire string, joining sections indicated by +
1113      signs. */
1114   for (;;)
1115     {
1116       /* Single or double quote. */
1117       int c = *lexer->prog++;
1118
1119       /* Accumulate section. */
1120       for (;;)
1121         {
1122           /* Check end of line. */
1123           if (*lexer->prog == '\0')
1124             {
1125               msg (SE, _("Unterminated string constant."));
1126               goto finish;
1127             }
1128
1129           /* Double quote characters to embed them in strings. */
1130           if (*lexer->prog == c)
1131             {
1132               if (lexer->prog[1] == c)
1133                 lexer->prog++;
1134               else
1135                 break;
1136             }
1137
1138           ds_put_byte (&lexer->tokstr, *lexer->prog++);
1139         }
1140       lexer->prog++;
1141
1142       /* Skip whitespace after final quote mark. */
1143       if (lexer->prog == NULL)
1144         break;
1145       for (;;)
1146         {
1147           while (c_isspace ((unsigned char) *lexer->prog))
1148             lexer->prog++;
1149           if (*lexer->prog)
1150             break;
1151
1152           if (lexer->dot)
1153             goto finish;
1154
1155           if (!lex_get_line (lexer))
1156             goto finish;
1157         }
1158
1159       /* Skip plus sign. */
1160       if (*lexer->prog != '+')
1161         break;
1162       lexer->prog++;
1163
1164       /* Skip whitespace after plus sign. */
1165       if (lexer->prog == NULL)
1166         break;
1167       for (;;)
1168         {
1169           while (c_isspace ((unsigned char) *lexer->prog))
1170             lexer->prog++;
1171           if (*lexer->prog)
1172             break;
1173
1174           if (lexer->dot)
1175             goto finish;
1176
1177           if (!lex_get_line (lexer))
1178             {
1179               msg (SE, _("Unexpected end of file in string concatenation."));
1180               goto finish;
1181             }
1182         }
1183
1184       /* Ensure that a valid string follows. */
1185       if (*lexer->prog != '\'' && *lexer->prog != '"')
1186         {
1187           msg (SE, _("String expected following `+'."));
1188           goto finish;
1189         }
1190     }
1191
1192   /* We come here when we've finished concatenating all the string sections
1193      into one large string. */
1194 finish:
1195   if (type != CHARACTER_STRING)
1196     convert_numeric_string_to_char_string (lexer, type);
1197
1198   return T_STRING;
1199 }
1200 \f
1201 /* Token Accessor Functions */
1202
1203 int
1204 lex_token (const struct lexer *lexer)
1205 {
1206   return lexer->token;
1207 }
1208
1209 double
1210 lex_tokval (const struct lexer *lexer)
1211 {
1212   return lexer->tokval;
1213 }
1214
1215 const char *
1216 lex_tokid (const struct lexer *lexer)
1217 {
1218   return lexer->tokid;
1219 }
1220
1221 const struct string *
1222 lex_tokstr (const struct lexer *lexer)
1223 {
1224   return &lexer->tokstr;
1225 }
1226
1227 /* If the lexer is positioned at the (pseudo)identifier S, which
1228    may contain a hyphen ('-'), skips it and returns true.  Each
1229    half of the identifier may be abbreviated to its first three
1230    letters.
1231    Otherwise, returns false. */
1232 bool
1233 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1234 {
1235   const char *hyphen = strchr (s, '-');
1236   if (hyphen == NULL)
1237     return lex_match_id (lexer, s);
1238   else if (lexer->token != T_ID
1239            || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid))
1240            || lex_look_ahead (lexer) != '-')
1241     return false;
1242   else
1243     {
1244       lex_get (lexer);
1245       lex_force_match (lexer, '-');
1246       lex_force_match_id (lexer, hyphen + 1);
1247       return true;
1248     }
1249 }
1250