pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18 #include "lexer.h"
  19 #include <libpspp/message.h>
  20 #include <c-ctype.h>
  21 #include <c-strtod.h>
  22 #include <errno.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <libpspp/assertion.h>
  29 #include <language/command.h>
  30 #include <libpspp/message.h>
  31 #include <data/settings.h>
  32 #include <libpspp/getl.h>
  33 #include <libpspp/str.h>
  34 #include <output/journal.h>
  35 #include <output/text-item.h>
  36
  37 #include "xalloc.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43 struct lexer
  44 {
  45   struct string line_buffer;
  46
  47   struct source_stream *ss;
  48
  49   int token;      /* Current token. */
  50   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  51
  52   char tokid [VAR_NAME_LEN + 1];   /* T_ID: the identifier. */
  53
  54   struct string tokstr;   /* T_ID, T_STRING: token string value.
  55                             For T_ID, this is not truncated as is
  56                             tokid. */
  57
  58   char *prog; /* Pointer to next token in line_buffer. */
  59   bool dot;   /* True only if this line ends with a terminal dot. */
  60
  61   int put_token ; /* If nonzero, next token returned by lex_get().
  62                     Used only in exceptional circumstances. */
  63
  64   struct string put_tokstr;
  65   double put_tokval;
  66 };
  67
  68
  69 static int parse_id (struct lexer *);
  70
  71 /* How a string represents its contents. */
  72 enum string_type
  73   {
  74     CHARACTER_STRING,   /* Characters. */
  75     BINARY_STRING,      /* Binary digits. */
  76     OCTAL_STRING,       /* Octal digits. */
  77     HEX_STRING          /* Hexadecimal digits. */
  78   };
  79
  80 static int parse_string (struct lexer *, enum string_type);
  81 \f
  82 /* Initialization. */
  83
  84 /* Initializes the lexer. */
  85 struct lexer *
  86 lex_create (struct source_stream *ss)
  87 {
  88   struct lexer *lexer = xzalloc (sizeof (*lexer));
  89
  90   ds_init_empty (&lexer->tokstr);
  91   ds_init_empty (&lexer->put_tokstr);
  92   ds_init_empty (&lexer->line_buffer);
  93   lexer->ss = ss;
  94
  95   return lexer;
  96 }
  97
  98 struct source_stream *
  99 lex_get_source_stream (const struct lexer *lex)
 100 {
 101   return lex->ss;
 102 }
 103
 104 enum syntax_mode
 105 lex_current_syntax_mode (const struct lexer *lex)
 106 {
 107   return source_stream_current_syntax_mode (lex->ss);
 108 }
 109
 110 enum error_mode
 111 lex_current_error_mode (const struct lexer *lex)
 112 {
 113   return source_stream_current_error_mode (lex->ss);
 114 }
 115
 116
 117 void
 118 lex_destroy (struct lexer *lexer)
 119 {
 120   if ( NULL != lexer )
 121     {
 122       ds_destroy (&lexer->put_tokstr);
 123       ds_destroy (&lexer->tokstr);
 124       ds_destroy (&lexer->line_buffer);
 125
 126       free (lexer);
 127     }
 128 }
 129
 130 \f
 131 /* Common functions. */
 132
 133 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 134    tokval, respectively, and sets tokid appropriately. */
 135 static void
 136 restore_token (struct lexer *lexer)
 137 {
 138   assert (lexer->put_token != 0);
 139   lexer->token = lexer->put_token;
 140   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 141   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 142   lexer->tokval = lexer->put_tokval;
 143   lexer->put_token = 0;
 144 }
 145
 146 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 147    put_lexer->tokval respectively. */
 148 static void
 149 save_token (struct lexer *lexer)
 150 {
 151   lexer->put_token = lexer->token;
 152   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 153   lexer->put_tokval = lexer->tokval;
 154 }
 155
 156 /* Parses a single token, setting appropriate global variables to
 157    indicate the token's attributes. */
 158 void
 159 lex_get (struct lexer *lexer)
 160 {
 161   /* Find a token. */
 162   for (;;)
 163     {
 164       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 165         {
 166           lexer->token = T_STOP;
 167           return;
 168         }
 169
 170       /* If a token was pushed ahead, return it. */
 171       if (lexer->put_token)
 172         {
 173           restore_token (lexer);
 174           return;
 175         }
 176
 177       for (;;)
 178         {
 179           /* Skip whitespace. */
 180           while (c_isspace ((unsigned char) *lexer->prog))
 181             lexer->prog++;
 182
 183           if (*lexer->prog)
 184             break;
 185
 186           if (lexer->dot)
 187             {
 188               lexer->dot = 0;
 189               lexer->token = '.';
 190               return;
 191             }
 192           else if (!lex_get_line (lexer))
 193             {
 194               lexer->prog = NULL;
 195               lexer->token = T_STOP;
 196               return;
 197             }
 198
 199           if (lexer->put_token)
 200             {
 201               restore_token (lexer);
 202               return;
 203             }
 204         }
 205
 206
 207       /* Actually parse the token. */
 208       ds_clear (&lexer->tokstr);
 209
 210       switch (*lexer->prog)
 211         {
 212         case '-': case '.':
 213         case '0': case '1': case '2': case '3': case '4':
 214         case '5': case '6': case '7': case '8': case '9':
 215           {
 216             char *tail;
 217
 218             /* `-' can introduce a negative number, or it can be a
 219                token by itself.  If it is not followed by a digit or a
 220                decimal point, it is definitely not a number.
 221                Otherwise, it might be either, but most of the time we
 222                want it as a number.  When the syntax calls for a `-'
 223                token, lex_negative_to_dash() must be used to break
 224                negative numbers into two tokens. */
 225             if (*lexer->prog == '-')
 226               {
 227                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 228                 while (c_isspace ((unsigned char) *lexer->prog))
 229                   lexer->prog++;
 230
 231                 if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 232                   {
 233                     lexer->token = '-';
 234                     break;
 235                   }
 236                 lexer->token = T_NEG_NUM;
 237               }
 238             else
 239               lexer->token = T_POS_NUM;
 240
 241             /* Parse the number, copying it into tokstr. */
 242             while (c_isdigit ((unsigned char) *lexer->prog))
 243               ds_put_char (&lexer->tokstr, *lexer->prog++);
 244             if (*lexer->prog == '.')
 245               {
 246                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 247                 while (c_isdigit ((unsigned char) *lexer->prog))
 248                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 249               }
 250             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 251               {
 252                 ds_put_char (&lexer->tokstr, *lexer->prog++);
 253                 if (*lexer->prog == '+' || *lexer->prog == '-')
 254                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 255                 while (c_isdigit ((unsigned char) *lexer->prog))
 256                   ds_put_char (&lexer->tokstr, *lexer->prog++);
 257               }
 258
 259             /* Parse as floating point. */
 260             lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail);
 261             if (*tail)
 262               {
 263                 msg (SE, _("%s does not form a valid number."),
 264                      ds_cstr (&lexer->tokstr));
 265                 lexer->tokval = 0.0;
 266
 267                 ds_clear (&lexer->tokstr);
 268                 ds_put_char (&lexer->tokstr, '0');
 269               }
 270
 271             break;
 272           }
 273
 274         case '\'': case '"':
 275           lexer->token = parse_string (lexer, CHARACTER_STRING);
 276           break;
 277
 278         case '(': case ')': case ',': case '=': case '+': case '/':
 279         case '[': case ']':
 280           lexer->token = *lexer->prog++;
 281           break;
 282
 283         case '*':
 284           if (*++lexer->prog == '*')
 285             {
 286               lexer->prog++;
 287               lexer->token = T_EXP;
 288             }
 289           else
 290             lexer->token = '*';
 291           break;
 292
 293         case '<':
 294           if (*++lexer->prog == '=')
 295             {
 296               lexer->prog++;
 297               lexer->token = T_LE;
 298             }
 299           else if (*lexer->prog == '>')
 300             {
 301               lexer->prog++;
 302               lexer->token = T_NE;
 303             }
 304           else
 305             lexer->token = T_LT;
 306           break;
 307
 308         case '>':
 309           if (*++lexer->prog == '=')
 310             {
 311               lexer->prog++;
 312               lexer->token = T_GE;
 313             }
 314           else
 315             lexer->token = T_GT;
 316           break;
 317
 318         case '~':
 319           if (*++lexer->prog == '=')
 320             {
 321               lexer->prog++;
 322               lexer->token = T_NE;
 323             }
 324           else
 325             lexer->token = T_NOT;
 326           break;
 327
 328         case '&':
 329           lexer->prog++;
 330           lexer->token = T_AND;
 331           break;
 332
 333         case '|':
 334           lexer->prog++;
 335           lexer->token = T_OR;
 336           break;
 337
 338         case 'b': case 'B':
 339           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 340             lexer->token = parse_string (lexer, BINARY_STRING);
 341           else
 342             lexer->token = parse_id (lexer);
 343           break;
 344
 345         case 'o': case 'O':
 346           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 347             lexer->token = parse_string (lexer, OCTAL_STRING);
 348           else
 349             lexer->token = parse_id (lexer);
 350           break;
 351
 352         case 'x': case 'X':
 353           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 354             lexer->token = parse_string (lexer, HEX_STRING);
 355           else
 356             lexer->token = parse_id (lexer);
 357           break;
 358
 359         default:
 360           if (lex_is_id1 (*lexer->prog))
 361             {
 362               lexer->token = parse_id (lexer);
 363               break;
 364             }
 365           else
 366             {
 367               unsigned char c = *lexer->prog++;
 368               char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c);
 369               msg (SE, _("Bad character in input: `%s'."), c_name);
 370               free (c_name);
 371               continue;
 372             }
 373         }
 374       break;
 375     }
 376 }
 377
 378 /* Parses an identifier at the current position into tokid and
 379    tokstr.
 380    Returns the correct token type. */
 381 static int
 382 parse_id (struct lexer *lexer)
 383 {
 384   struct substring rest_of_line
 385     = ss_substr (ds_ss (&lexer->line_buffer),
 386                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 387                  SIZE_MAX);
 388   struct substring id = ss_head (rest_of_line,
 389                                  lex_id_get_length (rest_of_line));
 390   lexer->prog += ss_length (id);
 391
 392   ds_assign_substring (&lexer->tokstr, id);
 393   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 394   return lex_id_to_token (id);
 395 }
 396
 397 /* Reports an error to the effect that subcommand SBC may only be
 398    specified once. */
 399 void
 400 lex_sbc_only_once (const char *sbc)
 401 {
 402   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 403 }
 404
 405 /* Reports an error to the effect that subcommand SBC is
 406    missing. */
 407 void
 408 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 409 {
 410   lex_error (lexer, _("missing required subcommand %s"), sbc);
 411 }
 412
 413 /* Prints a syntax error message containing the current token and
 414    given message MESSAGE (if non-null). */
 415 void
 416 lex_error (struct lexer *lexer, const char *message, ...)
 417 {
 418   struct string s;
 419
 420   ds_init_empty (&s);
 421
 422   if (lexer->token == T_STOP)
 423     ds_put_cstr (&s, _("Syntax error at end of file"));
 424   else if (lexer->token == '.')
 425     ds_put_cstr (&s, _("Syntax error at end of command"));
 426   else
 427     {
 428       char *token_rep = lex_token_representation (lexer);
 429       ds_put_format (&s, _("Syntax error at `%s'"), token_rep);
 430       free (token_rep);
 431     }
 432
 433   if (message)
 434     {
 435       va_list args;
 436
 437       ds_put_cstr (&s, ": ");
 438
 439       va_start (args, message);
 440       ds_put_vformat (&s, message, args);
 441       va_end (args);
 442     }
 443
 444   msg (SE, "%s.", ds_cstr (&s));
 445   ds_destroy (&s);
 446 }
 447
 448 /* Checks that we're at end of command.
 449    If so, returns a successful command completion code.
 450    If not, flags a syntax error and returns an error command
 451    completion code. */
 452 int
 453 lex_end_of_command (struct lexer *lexer)
 454 {
 455   if (lexer->token != '.')
 456     {
 457       lex_error (lexer, _("expecting end of command"));
 458       return CMD_FAILURE;
 459     }
 460   else
 461     return CMD_SUCCESS;
 462 }
 463 \f
 464 /* Token testing functions. */
 465
 466 /* Returns true if the current token is a number. */
 467 bool
 468 lex_is_number (struct lexer *lexer)
 469 {
 470   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 471 }
 472
 473
 474 /* Returns true if the current token is a string. */
 475 bool
 476 lex_is_string (struct lexer *lexer)
 477 {
 478   return lexer->token == T_STRING;
 479 }
 480
 481
 482 /* Returns the value of the current token, which must be a
 483    floating point number. */
 484 double
 485 lex_number (struct lexer *lexer)
 486 {
 487   assert (lex_is_number (lexer));
 488   return lexer->tokval;
 489 }
 490
 491 /* Returns true iff the current token is an integer. */
 492 bool
 493 lex_is_integer (struct lexer *lexer)
 494 {
 495   return (lex_is_number (lexer)
 496           && lexer->tokval > LONG_MIN
 497           && lexer->tokval <= LONG_MAX
 498           && floor (lexer->tokval) == lexer->tokval);
 499 }
 500
 501 /* Returns the value of the current token, which must be an
 502    integer. */
 503 long
 504 lex_integer (struct lexer *lexer)
 505 {
 506   assert (lex_is_integer (lexer));
 507   return lexer->tokval;
 508 }
 509 \f
 510 /* Token matching functions. */
 511
 512 /* If TOK is the current token, skips it and returns true
 513    Otherwise, returns false. */
 514 bool
 515 lex_match (struct lexer *lexer, int t)
 516 {
 517   if (lexer->token == t)
 518     {
 519       lex_get (lexer);
 520       return true;
 521     }
 522   else
 523     return false;
 524 }
 525
 526 /* If the current token is the identifier S, skips it and returns
 527    true.  The identifier may be abbreviated to its first three
 528    letters.
 529    Otherwise, returns false. */
 530 bool
 531 lex_match_id (struct lexer *lexer, const char *s)
 532 {
 533   return lex_match_id_n (lexer, s, 3);
 534 }
 535
 536 /* If the current token is the identifier S, skips it and returns
 537    true.  The identifier may be abbreviated to its first N
 538    letters.
 539    Otherwise, returns false. */
 540 bool
 541 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 542 {
 543   if (lexer->token == T_ID
 544       && lex_id_match_n (ss_cstr (s), ss_cstr (lexer->tokid), n))
 545     {
 546       lex_get (lexer);
 547       return true;
 548     }
 549   else
 550     return false;
 551 }
 552
 553 /* If the current token is integer N, skips it and returns true.
 554    Otherwise, returns false. */
 555 bool
 556 lex_match_int (struct lexer *lexer, int x)
 557 {
 558   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 559     {
 560       lex_get (lexer);
 561       return true;
 562     }
 563   else
 564     return false;
 565 }
 566 \f
 567 /* Forced matches. */
 568
 569 /* If this token is identifier S, fetches the next token and returns
 570    nonzero.
 571    Otherwise, reports an error and returns zero. */
 572 bool
 573 lex_force_match_id (struct lexer *lexer, const char *s)
 574 {
 575   if (lex_match_id (lexer, s))
 576     return true;
 577   else
 578     {
 579       lex_error (lexer, _("expecting `%s'"), s);
 580       return false;
 581     }
 582 }
 583
 584 /* If the current token is T, skips the token.  Otherwise, reports an
 585    error and returns from the current function with return value false. */
 586 bool
 587 lex_force_match (struct lexer *lexer, int t)
 588 {
 589   if (lexer->token == t)
 590     {
 591       lex_get (lexer);
 592       return true;
 593     }
 594   else
 595     {
 596       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 597       return false;
 598     }
 599 }
 600
 601 /* If this token is a string, does nothing and returns true.
 602    Otherwise, reports an error and returns false. */
 603 bool
 604 lex_force_string (struct lexer *lexer)
 605 {
 606   if (lex_is_string (lexer))
 607     return true;
 608   else
 609     {
 610       lex_error (lexer, _("expecting string"));
 611       return false;
 612     }
 613 }
 614
 615 /* If this token is an integer, does nothing and returns true.
 616    Otherwise, reports an error and returns false. */
 617 bool
 618 lex_force_int (struct lexer *lexer)
 619 {
 620   if (lex_is_integer (lexer))
 621     return true;
 622   else
 623     {
 624       lex_error (lexer, _("expecting integer"));
 625       return false;
 626     }
 627 }
 628
 629 /* If this token is a number, does nothing and returns true.
 630    Otherwise, reports an error and returns false. */
 631 bool
 632 lex_force_num (struct lexer *lexer)
 633 {
 634   if (lex_is_number (lexer))
 635     return true;
 636
 637   lex_error (lexer, _("expecting number"));
 638   return false;
 639 }
 640
 641 /* If this token is an identifier, does nothing and returns true.
 642    Otherwise, reports an error and returns false. */
 643 bool
 644 lex_force_id (struct lexer *lexer)
 645 {
 646   if (lexer->token == T_ID)
 647     return true;
 648
 649   lex_error (lexer, _("expecting identifier"));
 650   return false;
 651 }
 652
 653 /* Weird token functions. */
 654
 655 /* Returns the first character of the next token, except that if the
 656    next token is not an identifier, the character returned will not be
 657    a character that can begin an identifier.  Specifically, the
 658    hexstring lead-in X' causes lookahead() to return '.  Note that an
 659    alphanumeric return value doesn't guarantee an ID token, it could
 660    also be a reserved-word token. */
 661 int
 662 lex_look_ahead (struct lexer *lexer)
 663 {
 664   if (lexer->put_token)
 665     return lexer->put_token;
 666
 667   for (;;)
 668     {
 669       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 670         return 0;
 671
 672       for (;;)
 673         {
 674           while (c_isspace ((unsigned char) *lexer->prog))
 675             lexer->prog++;
 676           if (*lexer->prog)
 677             break;
 678
 679           if (lexer->dot)
 680             return '.';
 681           else if (!lex_get_line (lexer))
 682             return 0;
 683
 684           if (lexer->put_token)
 685             return lexer->put_token;
 686         }
 687
 688       if ((toupper ((unsigned char) *lexer->prog) == 'X'
 689            || toupper ((unsigned char) *lexer->prog) == 'B'
 690            || toupper ((unsigned char) *lexer->prog) == 'O')
 691           && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
 692         return '\'';
 693
 694       return *lexer->prog;
 695     }
 696 }
 697
 698 /* Makes the current token become the next token to be read; the
 699    current token is set to T. */
 700 void
 701 lex_put_back (struct lexer *lexer, int t)
 702 {
 703   save_token (lexer);
 704   lexer->token = t;
 705 }
 706
 707 /* Makes the current token become the next token to be read; the
 708    current token is set to the identifier ID. */
 709 void
 710 lex_put_back_id (struct lexer *lexer, const char *id)
 711 {
 712   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 713   save_token (lexer);
 714   lexer->token = T_ID;
 715   ds_assign_cstr (&lexer->tokstr, id);
 716   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 717 }
 718 \f
 719 /* Weird line processing functions. */
 720
 721 /* Returns the entire contents of the current line. */
 722 const char *
 723 lex_entire_line (const struct lexer *lexer)
 724 {
 725   return ds_cstr (&lexer->line_buffer);
 726 }
 727
 728 const struct string *
 729 lex_entire_line_ds (const struct lexer *lexer)
 730 {
 731   return &lexer->line_buffer;
 732 }
 733
 734 /* As lex_entire_line(), but only returns the part of the current line
 735    that hasn't already been tokenized. */
 736 const char *
 737 lex_rest_of_line (const struct lexer *lexer)
 738 {
 739   return lexer->prog;
 740 }
 741
 742 /* Returns true if the current line ends in a terminal dot,
 743    false otherwise. */
 744 bool
 745 lex_end_dot (const struct lexer *lexer)
 746 {
 747   return lexer->dot;
 748 }
 749
 750 /* Causes the rest of the current input line to be ignored for
 751    tokenization purposes. */
 752 void
 753 lex_discard_line (struct lexer *lexer)
 754 {
 755   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 756   lexer->prog = ds_end (&lexer->line_buffer);
 757   lexer->dot = false;
 758   lexer->put_token = 0;
 759 }
 760
 761
 762 /* Discards the rest of the current command.
 763    When we're reading commands from a file, we skip tokens until
 764    a terminal dot or EOF.
 765    When we're reading commands interactively from the user,
 766    that's just discarding the current line, because presumably
 767    the user doesn't want to finish typing a command that will be
 768    ignored anyway. */
 769 void
 770 lex_discard_rest_of_command (struct lexer *lexer)
 771 {
 772   if (!getl_is_interactive (lexer->ss))
 773     {
 774       while (lexer->token != T_STOP && lexer->token != '.')
 775         lex_get (lexer);
 776     }
 777   else
 778     lex_discard_line (lexer);
 779 }
 780 \f
 781 /* Weird line reading functions. */
 782
 783 /* Remove C-style comments in STRING, begun by slash-star and
 784    terminated by star-slash or newline. */
 785 static void
 786 strip_comments (struct string *string)
 787 {
 788   char *cp;
 789   int quote;
 790   bool in_comment;
 791
 792   in_comment = false;
 793   quote = EOF;
 794   for (cp = ds_cstr (string); *cp; )
 795     {
 796       /* If we're not in a comment, check for quote marks. */
 797       if (!in_comment)
 798         {
 799           if (*cp == quote)
 800             quote = EOF;
 801           else if (*cp == '\'' || *cp == '"')
 802             quote = *cp;
 803         }
 804
 805       /* If we're not inside a quotation, check for comment. */
 806       if (quote == EOF)
 807         {
 808           if (cp[0] == '/' && cp[1] == '*')
 809             {
 810               in_comment = true;
 811               *cp++ = ' ';
 812               *cp++ = ' ';
 813               continue;
 814             }
 815           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 816             {
 817               in_comment = false;
 818               *cp++ = ' ';
 819               *cp++ = ' ';
 820               continue;
 821             }
 822         }
 823
 824       /* Check commenting. */
 825       if (in_comment)
 826         *cp = ' ';
 827       cp++;
 828     }
 829 }
 830
 831 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 832    tokenization by stripping comments and determining whether it
 833    is the beginning or end of a command and storing into
 834    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 835 void
 836 lex_preprocess_line (struct string *line,
 837                      enum syntax_mode syntax,
 838                      bool *line_starts_command,
 839                      bool *line_ends_command)
 840 {
 841   strip_comments (line);
 842   ds_rtrim (line, ss_cstr (CC_SPACES));
 843   *line_ends_command = (ds_chomp (line, settings_get_endcmd ())
 844                         || (ds_is_empty (line) && settings_get_nulline ()));
 845   *line_starts_command = false;
 846   if (syntax == GETL_BATCH)
 847     {
 848       int first = ds_first (line);
 849       *line_starts_command = !c_isspace (first);
 850       if (first == '+' || first == '-')
 851         *ds_data (line) = ' ';
 852     }
 853 }
 854
 855 /* Reads a line, without performing any preprocessing. */
 856 bool
 857 lex_get_line_raw (struct lexer *lexer)
 858 {
 859   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 860   if (ok)
 861     {
 862       const char *line = ds_cstr (&lexer->line_buffer);
 863       text_item_submit (text_item_create (TEXT_ITEM_SYNTAX, line));
 864     }
 865   return ok;
 866 }
 867
 868 /* Reads a line for use by the tokenizer, and preprocesses it by
 869    removing comments, stripping trailing whitespace and the
 870    terminal dot, and removing leading indentors. */
 871 bool
 872 lex_get_line (struct lexer *lexer)
 873 {
 874   bool line_starts_command;
 875
 876   if (!lex_get_line_raw (lexer))
 877     {
 878       lexer->prog = NULL;
 879       return false;
 880     }
 881
 882   lex_preprocess_line (&lexer->line_buffer,
 883                        lex_current_syntax_mode (lexer),
 884                        &line_starts_command, &lexer->dot);
 885
 886   if (line_starts_command)
 887     lexer->put_token = '.';
 888
 889   lexer->prog = ds_cstr (&lexer->line_buffer);
 890   return true;
 891 }
 892 \f
 893 /* Token names. */
 894
 895 /* Returns the name of a token. */
 896 const char *
 897 lex_token_name (int token)
 898 {
 899   if (lex_is_keyword (token))
 900     return lex_id_name (token);
 901   else if (token < 256)
 902     {
 903       static char t[256][2];
 904       char *s = t[token];
 905       s[0] = token;
 906       s[1] = '\0';
 907       return s;
 908     }
 909   else
 910     NOT_REACHED ();
 911 }
 912
 913 /* Returns an ASCII representation of the current token as a
 914    malloc()'d string. */
 915 char *
 916 lex_token_representation (struct lexer *lexer)
 917 {
 918   char *token_rep;
 919
 920   switch (lexer->token)
 921     {
 922     case T_ID:
 923     case T_POS_NUM:
 924     case T_NEG_NUM:
 925       return ds_xstrdup (&lexer->tokstr);
 926       break;
 927
 928     case T_STRING:
 929       {
 930         int hexstring = 0;
 931         char *sp, *dp;
 932
 933         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 934           if (!c_isprint ((unsigned char) *sp))
 935             {
 936               hexstring = 1;
 937               break;
 938             }
 939
 940         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
 941
 942         dp = token_rep;
 943         if (hexstring)
 944           *dp++ = 'X';
 945         *dp++ = '\'';
 946
 947         if (!hexstring)
 948           for (sp = ds_cstr (&lexer->tokstr); *sp; )
 949             {
 950               if (*sp == '\'')
 951                 *dp++ = '\'';
 952               *dp++ = (unsigned char) *sp++;
 953             }
 954         else
 955           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
 956             {
 957               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 958               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 959             }
 960         *dp++ = '\'';
 961         *dp = '\0';
 962
 963         return token_rep;
 964       }
 965     break;
 966
 967     case T_STOP:
 968       token_rep = xmalloc (1);
 969       *token_rep = '\0';
 970       return token_rep;
 971
 972     case T_EXP:
 973       return xstrdup ("**");
 974
 975     default:
 976       return xstrdup (lex_token_name (lexer->token));
 977     }
 978
 979   NOT_REACHED ();
 980 }
 981 \f
 982 /* Really weird functions. */
 983
 984 /* Most of the time, a `-' is a lead-in to a negative number.  But
 985    sometimes it's actually part of the syntax.  If a dash can be part
 986    of syntax then this function is called to rip it off of a
 987    number. */
 988 void
 989 lex_negative_to_dash (struct lexer *lexer)
 990 {
 991   if (lexer->token == T_NEG_NUM)
 992     {
 993       lexer->token = T_POS_NUM;
 994       lexer->tokval = -lexer->tokval;
 995       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
 996       save_token (lexer);
 997       lexer->token = '-';
 998     }
 999 }
1000
1001 /* Skip a COMMENT command. */
1002 void
1003 lex_skip_comment (struct lexer *lexer)
1004 {
1005   for (;;)
1006     {
1007       if (!lex_get_line (lexer))
1008         {
1009           lexer->put_token = T_STOP;
1010           lexer->prog = NULL;
1011           return;
1012         }
1013
1014       if (lexer->put_token == '.')
1015         break;
1016
1017       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1018       lexer->prog = ds_end (&lexer->line_buffer);
1019       if (lexer->dot)
1020         break;
1021     }
1022 }
1023 \f
1024 /* Private functions. */
1025
1026 /* When invoked, tokstr contains a string of binary, octal, or
1027    hex digits, according to TYPE.  The string is converted to
1028    characters having the specified values. */
1029 static void
1030 convert_numeric_string_to_char_string (struct lexer *lexer,
1031                                        enum string_type type)
1032 {
1033   const char *base_name;
1034   int base;
1035   int chars_per_byte;
1036   size_t byte_cnt;
1037   size_t i;
1038   char *p;
1039
1040   switch (type)
1041     {
1042     case BINARY_STRING:
1043       base_name = _("binary");
1044       base = 2;
1045       chars_per_byte = 8;
1046       break;
1047     case OCTAL_STRING:
1048       base_name = _("octal");
1049       base = 8;
1050       chars_per_byte = 3;
1051       break;
1052     case HEX_STRING:
1053       base_name = _("hex");
1054       base = 16;
1055       chars_per_byte = 2;
1056       break;
1057     default:
1058       NOT_REACHED ();
1059     }
1060
1061   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1062   if (ds_length (&lexer->tokstr) % chars_per_byte)
1063     msg (SE, _("String of %s digits has %zu characters, which is not a "
1064                "multiple of %d."),
1065          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1066
1067   p = ds_cstr (&lexer->tokstr);
1068   for (i = 0; i < byte_cnt; i++)
1069     {
1070       int value;
1071       int j;
1072
1073       value = 0;
1074       for (j = 0; j < chars_per_byte; j++, p++)
1075         {
1076           int v;
1077
1078           if (*p >= '0' && *p <= '9')
1079             v = *p - '0';
1080           else
1081             {
1082               static const char alpha[] = "abcdef";
1083               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1084
1085               if (q)
1086                 v = q - alpha + 10;
1087               else
1088                 v = base;
1089             }
1090
1091           if (v >= base)
1092             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1093
1094           value = value * base + v;
1095         }
1096
1097       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1098     }
1099
1100   ds_truncate (&lexer->tokstr, byte_cnt);
1101 }
1102
1103 /* Parses a string from the input buffer into tokstr.  The input
1104    buffer pointer lexer->prog must point to the initial single or double
1105    quote.  TYPE indicates the type of string to be parsed.
1106    Returns token type. */
1107 static int
1108 parse_string (struct lexer *lexer, enum string_type type)
1109 {
1110   if (type != CHARACTER_STRING)
1111     lexer->prog++;
1112
1113   /* Accumulate the entire string, joining sections indicated by +
1114      signs. */
1115   for (;;)
1116     {
1117       /* Single or double quote. */
1118       int c = *lexer->prog++;
1119
1120       /* Accumulate section. */
1121       for (;;)
1122         {
1123           /* Check end of line. */
1124           if (*lexer->prog == '\0')
1125             {
1126               msg (SE, _("Unterminated string constant."));
1127               goto finish;
1128             }
1129
1130           /* Double quote characters to embed them in strings. */
1131           if (*lexer->prog == c)
1132             {
1133               if (lexer->prog[1] == c)
1134                 lexer->prog++;
1135               else
1136                 break;
1137             }
1138
1139           ds_put_char (&lexer->tokstr, *lexer->prog++);
1140         }
1141       lexer->prog++;
1142
1143       /* Skip whitespace after final quote mark. */
1144       if (lexer->prog == NULL)
1145         break;
1146       for (;;)
1147         {
1148           while (c_isspace ((unsigned char) *lexer->prog))
1149             lexer->prog++;
1150           if (*lexer->prog)
1151             break;
1152
1153           if (lexer->dot)
1154             goto finish;
1155
1156           if (!lex_get_line (lexer))
1157             goto finish;
1158         }
1159
1160       /* Skip plus sign. */
1161       if (*lexer->prog != '+')
1162         break;
1163       lexer->prog++;
1164
1165       /* Skip whitespace after plus sign. */
1166       if (lexer->prog == NULL)
1167         break;
1168       for (;;)
1169         {
1170           while (c_isspace ((unsigned char) *lexer->prog))
1171             lexer->prog++;
1172           if (*lexer->prog)
1173             break;
1174
1175           if (lexer->dot)
1176             goto finish;
1177
1178           if (!lex_get_line (lexer))
1179             {
1180               msg (SE, _("Unexpected end of file in string concatenation."));
1181               goto finish;
1182             }
1183         }
1184
1185       /* Ensure that a valid string follows. */
1186       if (*lexer->prog != '\'' && *lexer->prog != '"')
1187         {
1188           msg (SE, _("String expected following `+'."));
1189           goto finish;
1190         }
1191     }
1192
1193   /* We come here when we've finished concatenating all the string sections
1194      into one large string. */
1195 finish:
1196   if (type != CHARACTER_STRING)
1197     convert_numeric_string_to_char_string (lexer, type);
1198
1199   return T_STRING;
1200 }
1201 \f
1202 /* Token Accessor Functions */
1203
1204 int
1205 lex_token (const struct lexer *lexer)
1206 {
1207   return lexer->token;
1208 }
1209
1210 double
1211 lex_tokval (const struct lexer *lexer)
1212 {
1213   return lexer->tokval;
1214 }
1215
1216 const char *
1217 lex_tokid (const struct lexer *lexer)
1218 {
1219   return lexer->tokid;
1220 }
1221
1222 const struct string *
1223 lex_tokstr (const struct lexer *lexer)
1224 {
1225   return &lexer->tokstr;
1226 }
1227
1228 /* If the lexer is positioned at the (pseudo)identifier S, which
1229    may contain a hyphen ('-'), skips it and returns true.  Each
1230    half of the identifier may be abbreviated to its first three
1231    letters.
1232    Otherwise, returns false. */
1233 bool
1234 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1235 {
1236   const char *hyphen = strchr (s, '-');
1237   if (hyphen == NULL)
1238     return lex_match_id (lexer, s);
1239   else if (lexer->token != T_ID
1240            || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid))
1241            || lex_look_ahead (lexer) != '-')
1242     return false;
1243   else
1244     {
1245       lex_get (lexer);
1246       lex_force_match (lexer, '-');
1247       lex_force_match_id (lexer, hyphen + 1);
1248       return true;
1249     }
1250 }
1251