src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <c-ctype.h>
  22 #include <c-strtod.h>
  23 #include <errno.h>
  24 #include <limits.h>
  25 #include <math.h>
  26 #include <stdarg.h>
  27 #include <stdint.h>
  28 #include <stdlib.h>
  29
  30 #include "data/settings.h"
  31 #include "language/command.h"
  32 #include "libpspp/assertion.h"
  33 #include "libpspp/getl.h"
  34 #include "libpspp/message.h"
  35 #include "libpspp/str.h"
  36 #include "output/journal.h"
  37 #include "output/text-item.h"
  38
  39 #include "gl/xalloc.h"
  40
  41 #include "gettext.h"
  42 #define _(msgid) gettext (msgid)
  43 #define N_(msgid) msgid
  44
  45 struct lexer
  46 {
  47   struct string line_buffer;
  48
  49   struct source_stream *ss;
  50
  51   int token;      /* Current token. */
  52   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  53
  54   struct string tokstr;   /* T_ID, T_STRING: token string value. */
  55
  56   char *prog; /* Pointer to next token in line_buffer. */
  57   bool dot;   /* True only if this line ends with a terminal dot. */
  58
  59   int put_token ; /* If nonzero, next token returned by lex_get().
  60                     Used only in exceptional circumstances. */
  61
  62   struct string put_tokstr;
  63   double put_tokval;
  64 };
  65
  66
  67 static int parse_id (struct lexer *);
  68
  69 /* How a string represents its contents. */
  70 enum string_type
  71   {
  72     CHARACTER_STRING,   /* Characters. */
  73     BINARY_STRING,      /* Binary digits. */
  74     OCTAL_STRING,       /* Octal digits. */
  75     HEX_STRING          /* Hexadecimal digits. */
  76   };
  77
  78 static int parse_string (struct lexer *, enum string_type);
  79 \f
  80 /* Initialization. */
  81
  82 /* Initializes the lexer. */
  83 struct lexer *
  84 lex_create (struct source_stream *ss)
  85 {
  86   struct lexer *lexer = xzalloc (sizeof (*lexer));
  87
  88   ds_init_empty (&lexer->tokstr);
  89   ds_init_empty (&lexer->put_tokstr);
  90   ds_init_empty (&lexer->line_buffer);
  91   lexer->ss = ss;
  92
  93   return lexer;
  94 }
  95
  96 struct source_stream *
  97 lex_get_source_stream (const struct lexer *lex)
  98 {
  99   return lex->ss;
 100 }
 101
 102 enum syntax_mode
 103 lex_current_syntax_mode (const struct lexer *lex)
 104 {
 105   return source_stream_current_syntax_mode (lex->ss);
 106 }
 107
 108 enum error_mode
 109 lex_current_error_mode (const struct lexer *lex)
 110 {
 111   return source_stream_current_error_mode (lex->ss);
 112 }
 113
 114
 115 void
 116 lex_destroy (struct lexer *lexer)
 117 {
 118   if ( NULL != lexer )
 119     {
 120       ds_destroy (&lexer->put_tokstr);
 121       ds_destroy (&lexer->tokstr);
 122       ds_destroy (&lexer->line_buffer);
 123
 124       free (lexer);
 125     }
 126 }
 127
 128 \f
 129 /* Common functions. */
 130
 131 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 132    tokval, respectively, and sets tokid appropriately. */
 133 static void
 134 restore_token (struct lexer *lexer)
 135 {
 136   assert (lexer->put_token != 0);
 137   lexer->token = lexer->put_token;
 138   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 139   lexer->tokval = lexer->put_tokval;
 140   lexer->put_token = 0;
 141 }
 142
 143 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 144    put_lexer->tokval respectively. */
 145 static void
 146 save_token (struct lexer *lexer)
 147 {
 148   lexer->put_token = lexer->token;
 149   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 150   lexer->put_tokval = lexer->tokval;
 151 }
 152
 153 /* Parses a single token, setting appropriate global variables to
 154    indicate the token's attributes. */
 155 void
 156 lex_get (struct lexer *lexer)
 157 {
 158   /* Find a token. */
 159   for (;;)
 160     {
 161       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 162         {
 163           lexer->token = T_STOP;
 164           return;
 165         }
 166
 167       /* If a token was pushed ahead, return it. */
 168       if (lexer->put_token)
 169         {
 170           restore_token (lexer);
 171           return;
 172         }
 173
 174       for (;;)
 175         {
 176           /* Skip whitespace. */
 177           while (c_isspace ((unsigned char) *lexer->prog))
 178             lexer->prog++;
 179
 180           if (*lexer->prog)
 181             break;
 182
 183           if (lexer->dot)
 184             {
 185               lexer->dot = 0;
 186               lexer->token = T_ENDCMD;
 187               return;
 188             }
 189           else if (!lex_get_line (lexer))
 190             {
 191               lexer->prog = NULL;
 192               lexer->token = T_STOP;
 193               return;
 194             }
 195
 196           if (lexer->put_token)
 197             {
 198               restore_token (lexer);
 199               return;
 200             }
 201         }
 202
 203
 204       /* Actually parse the token. */
 205       ds_clear (&lexer->tokstr);
 206
 207       switch (*lexer->prog)
 208         {
 209         case '-': case '.':
 210         case '0': case '1': case '2': case '3': case '4':
 211         case '5': case '6': case '7': case '8': case '9':
 212           {
 213             char *tail;
 214
 215             /* `-' can introduce a negative number, or it can be a token by
 216                itself. */
 217             if (*lexer->prog == '-')
 218               {
 219                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 220                 while (c_isspace ((unsigned char) *lexer->prog))
 221                   lexer->prog++;
 222
 223                 if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 224                   {
 225                     lexer->token = T_DASH;
 226                     break;
 227                   }
 228                 lexer->token = T_NEG_NUM;
 229               }
 230             else
 231               lexer->token = T_POS_NUM;
 232
 233             /* Parse the number, copying it into tokstr. */
 234             while (c_isdigit ((unsigned char) *lexer->prog))
 235               ds_put_byte (&lexer->tokstr, *lexer->prog++);
 236             if (*lexer->prog == '.')
 237               {
 238                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 239                 while (c_isdigit ((unsigned char) *lexer->prog))
 240                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 241               }
 242             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 243               {
 244                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 245                 if (*lexer->prog == '+' || *lexer->prog == '-')
 246                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 247                 while (c_isdigit ((unsigned char) *lexer->prog))
 248                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 249               }
 250
 251             /* Parse as floating point. */
 252             lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail);
 253             if (*tail)
 254               {
 255                 msg (SE, _("%s does not form a valid number."),
 256                      ds_cstr (&lexer->tokstr));
 257                 lexer->tokval = 0.0;
 258
 259                 ds_clear (&lexer->tokstr);
 260                 ds_put_byte (&lexer->tokstr, '0');
 261               }
 262
 263             break;
 264           }
 265
 266         case '\'': case '"':
 267           lexer->token = parse_string (lexer, CHARACTER_STRING);
 268           break;
 269
 270         case '+':
 271           lexer->token = T_PLUS;
 272           lexer->prog++;
 273           break;
 274
 275         case '/':
 276           lexer->token = T_SLASH;
 277           lexer->prog++;
 278           break;
 279
 280         case '=':
 281           lexer->token = T_EQUALS;
 282           lexer->prog++;
 283           break;
 284
 285         case '(':
 286           lexer->token = T_LPAREN;
 287           lexer->prog++;
 288           break;
 289
 290         case ')':
 291           lexer->token = T_RPAREN;
 292           lexer->prog++;
 293           break;
 294
 295         case '[':
 296           lexer->token = T_LBRACK;
 297           lexer->prog++;
 298           break;
 299
 300         case ']':
 301           lexer->token = T_RBRACK;
 302           lexer->prog++;
 303           break;
 304
 305         case ',':
 306           lexer->token = T_COMMA;
 307           lexer->prog++;
 308           break;
 309
 310         case '*':
 311           if (*++lexer->prog == '*')
 312             {
 313               lexer->prog++;
 314               lexer->token = T_EXP;
 315             }
 316           else
 317             lexer->token = T_ASTERISK;
 318           break;
 319
 320         case '<':
 321           if (*++lexer->prog == '=')
 322             {
 323               lexer->prog++;
 324               lexer->token = T_LE;
 325             }
 326           else if (*lexer->prog == '>')
 327             {
 328               lexer->prog++;
 329               lexer->token = T_NE;
 330             }
 331           else
 332             lexer->token = T_LT;
 333           break;
 334
 335         case '>':
 336           if (*++lexer->prog == '=')
 337             {
 338               lexer->prog++;
 339               lexer->token = T_GE;
 340             }
 341           else
 342             lexer->token = T_GT;
 343           break;
 344
 345         case '~':
 346           if (*++lexer->prog == '=')
 347             {
 348               lexer->prog++;
 349               lexer->token = T_NE;
 350             }
 351           else
 352             lexer->token = T_NOT;
 353           break;
 354
 355         case '&':
 356           lexer->prog++;
 357           lexer->token = T_AND;
 358           break;
 359
 360         case '|':
 361           lexer->prog++;
 362           lexer->token = T_OR;
 363           break;
 364
 365         case 'b': case 'B':
 366           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 367             lexer->token = parse_string (lexer, BINARY_STRING);
 368           else
 369             lexer->token = parse_id (lexer);
 370           break;
 371
 372         case 'o': case 'O':
 373           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 374             lexer->token = parse_string (lexer, OCTAL_STRING);
 375           else
 376             lexer->token = parse_id (lexer);
 377           break;
 378
 379         case 'x': case 'X':
 380           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 381             lexer->token = parse_string (lexer, HEX_STRING);
 382           else
 383             lexer->token = parse_id (lexer);
 384           break;
 385
 386         default:
 387           if (lex_is_id1 (*lexer->prog))
 388             {
 389               lexer->token = parse_id (lexer);
 390               break;
 391             }
 392           else
 393             {
 394               unsigned char c = *lexer->prog++;
 395               char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c);
 396               msg (SE, _("Bad character in input: `%s'."), c_name);
 397               free (c_name);
 398               continue;
 399             }
 400         }
 401       break;
 402     }
 403 }
 404
 405 /* Parses an identifier at the current position into tokstr.
 406    Returns the correct token type. */
 407 static int
 408 parse_id (struct lexer *lexer)
 409 {
 410   struct substring rest_of_line
 411     = ss_substr (ds_ss (&lexer->line_buffer),
 412                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 413                  SIZE_MAX);
 414   struct substring id = ss_head (rest_of_line,
 415                                  lex_id_get_length (rest_of_line));
 416   lexer->prog += ss_length (id);
 417
 418   ds_assign_substring (&lexer->tokstr, id);
 419   return lex_id_to_token (id);
 420 }
 421
 422 /* Reports an error to the effect that subcommand SBC may only be
 423    specified once. */
 424 void
 425 lex_sbc_only_once (const char *sbc)
 426 {
 427   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 428 }
 429
 430 /* Reports an error to the effect that subcommand SBC is
 431    missing. */
 432 void
 433 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 434 {
 435   lex_error (lexer, _("missing required subcommand %s"), sbc);
 436 }
 437
 438 /* Prints a syntax error message containing the current token and
 439    given message MESSAGE (if non-null). */
 440 void
 441 lex_error (struct lexer *lexer, const char *message, ...)
 442 {
 443   struct string s;
 444
 445   ds_init_empty (&s);
 446
 447   if (lexer->token == T_STOP)
 448     ds_put_cstr (&s, _("Syntax error at end of file"));
 449   else if (lexer->token == T_ENDCMD)
 450     ds_put_cstr (&s, _("Syntax error at end of command"));
 451   else
 452     {
 453       char *token_rep = lex_token_representation (lexer);
 454       ds_put_format (&s, _("Syntax error at `%s'"), token_rep);
 455       free (token_rep);
 456     }
 457
 458   if (message)
 459     {
 460       va_list args;
 461
 462       ds_put_cstr (&s, ": ");
 463
 464       va_start (args, message);
 465       ds_put_vformat (&s, message, args);
 466       va_end (args);
 467     }
 468
 469   msg (SE, "%s.", ds_cstr (&s));
 470   ds_destroy (&s);
 471 }
 472
 473 /* Checks that we're at end of command.
 474    If so, returns a successful command completion code.
 475    If not, flags a syntax error and returns an error command
 476    completion code. */
 477 int
 478 lex_end_of_command (struct lexer *lexer)
 479 {
 480   if (lexer->token != T_ENDCMD)
 481     {
 482       lex_error (lexer, _("expecting end of command"));
 483       return CMD_FAILURE;
 484     }
 485   else
 486     return CMD_SUCCESS;
 487 }
 488 \f
 489 /* Token testing functions. */
 490
 491 /* Returns true if the current token is a number. */
 492 bool
 493 lex_is_number (struct lexer *lexer)
 494 {
 495   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 496 }
 497
 498
 499 /* Returns true if the current token is a string. */
 500 bool
 501 lex_is_string (struct lexer *lexer)
 502 {
 503   return lexer->token == T_STRING;
 504 }
 505
 506
 507 /* Returns the value of the current token, which must be a
 508    floating point number. */
 509 double
 510 lex_number (struct lexer *lexer)
 511 {
 512   assert (lex_is_number (lexer));
 513   return lexer->tokval;
 514 }
 515
 516 /* Returns true iff the current token is an integer. */
 517 bool
 518 lex_is_integer (struct lexer *lexer)
 519 {
 520   return (lex_is_number (lexer)
 521           && lexer->tokval > LONG_MIN
 522           && lexer->tokval <= LONG_MAX
 523           && floor (lexer->tokval) == lexer->tokval);
 524 }
 525
 526 /* Returns the value of the current token, which must be an
 527    integer. */
 528 long
 529 lex_integer (struct lexer *lexer)
 530 {
 531   assert (lex_is_integer (lexer));
 532   return lexer->tokval;
 533 }
 534 \f
 535 /* Token matching functions. */
 536
 537 /* If TOK is the current token, skips it and returns true
 538    Otherwise, returns false. */
 539 bool
 540 lex_match (struct lexer *lexer, enum token_type t)
 541 {
 542   if (lexer->token == t)
 543     {
 544       lex_get (lexer);
 545       return true;
 546     }
 547   else
 548     return false;
 549 }
 550
 551 /* If the current token is the identifier S, skips it and returns
 552    true.  The identifier may be abbreviated to its first three
 553    letters.
 554    Otherwise, returns false. */
 555 bool
 556 lex_match_id (struct lexer *lexer, const char *s)
 557 {
 558   return lex_match_id_n (lexer, s, 3);
 559 }
 560
 561 /* If the current token is the identifier S, skips it and returns
 562    true.  The identifier may be abbreviated to its first N
 563    letters.
 564    Otherwise, returns false. */
 565 bool
 566 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 567 {
 568   if (lexer->token == T_ID
 569       && lex_id_match_n (ss_cstr (s), lex_tokss (lexer), n))
 570     {
 571       lex_get (lexer);
 572       return true;
 573     }
 574   else
 575     return false;
 576 }
 577
 578 /* If the current token is integer N, skips it and returns true.
 579    Otherwise, returns false. */
 580 bool
 581 lex_match_int (struct lexer *lexer, int x)
 582 {
 583   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 584     {
 585       lex_get (lexer);
 586       return true;
 587     }
 588   else
 589     return false;
 590 }
 591 \f
 592 /* Forced matches. */
 593
 594 /* If this token is identifier S, fetches the next token and returns
 595    nonzero.
 596    Otherwise, reports an error and returns zero. */
 597 bool
 598 lex_force_match_id (struct lexer *lexer, const char *s)
 599 {
 600   if (lex_match_id (lexer, s))
 601     return true;
 602   else
 603     {
 604       lex_error (lexer, _("expecting `%s'"), s);
 605       return false;
 606     }
 607 }
 608
 609 /* If the current token is T, skips the token.  Otherwise, reports an
 610    error and returns from the current function with return value false. */
 611 bool
 612 lex_force_match (struct lexer *lexer, enum token_type t)
 613 {
 614   if (lexer->token == t)
 615     {
 616       lex_get (lexer);
 617       return true;
 618     }
 619   else
 620     {
 621       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 622       return false;
 623     }
 624 }
 625
 626 /* If this token is a string, does nothing and returns true.
 627    Otherwise, reports an error and returns false. */
 628 bool
 629 lex_force_string (struct lexer *lexer)
 630 {
 631   if (lex_is_string (lexer))
 632     return true;
 633   else
 634     {
 635       lex_error (lexer, _("expecting string"));
 636       return false;
 637     }
 638 }
 639
 640 /* If this token is an integer, does nothing and returns true.
 641    Otherwise, reports an error and returns false. */
 642 bool
 643 lex_force_int (struct lexer *lexer)
 644 {
 645   if (lex_is_integer (lexer))
 646     return true;
 647   else
 648     {
 649       lex_error (lexer, _("expecting integer"));
 650       return false;
 651     }
 652 }
 653
 654 /* If this token is a number, does nothing and returns true.
 655    Otherwise, reports an error and returns false. */
 656 bool
 657 lex_force_num (struct lexer *lexer)
 658 {
 659   if (lex_is_number (lexer))
 660     return true;
 661
 662   lex_error (lexer, _("expecting number"));
 663   return false;
 664 }
 665
 666 /* If this token is an identifier, does nothing and returns true.
 667    Otherwise, reports an error and returns false. */
 668 bool
 669 lex_force_id (struct lexer *lexer)
 670 {
 671   if (lexer->token == T_ID)
 672     return true;
 673
 674   lex_error (lexer, _("expecting identifier"));
 675   return false;
 676 }
 677
 678 /* Weird token functions. */
 679
 680 /* Returns the likely type of the next token, or 0 if it's hard to tell. */
 681 enum token_type
 682 lex_look_ahead (struct lexer *lexer)
 683 {
 684   if (lexer->put_token)
 685     return lexer->put_token;
 686
 687   for (;;)
 688     {
 689       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 690         return 0;
 691
 692       for (;;)
 693         {
 694           while (c_isspace ((unsigned char) *lexer->prog))
 695             lexer->prog++;
 696           if (*lexer->prog)
 697             break;
 698
 699           if (lexer->dot)
 700             return T_ENDCMD;
 701           else if (!lex_get_line (lexer))
 702             return 0;
 703
 704           if (lexer->put_token)
 705             return lexer->put_token;
 706         }
 707
 708       switch (toupper ((unsigned char) *lexer->prog))
 709         {
 710         case 'X': case 'B': case 'O':
 711           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 712             return T_STRING;
 713           /* Fall through */
 714
 715         case '-':
 716           return T_DASH;
 717
 718         case '.':
 719         case '0': case '1': case '2': case '3': case '4':
 720         case '5': case '6': case '7': case '8': case '9':
 721           return T_POS_NUM;
 722
 723         case '\'': case '"':
 724           return T_STRING;
 725
 726         case '+':
 727           return T_PLUS;
 728
 729         case '/':
 730           return T_SLASH;
 731
 732         case '=':
 733           return T_EQUALS;
 734
 735         case '(':
 736           return T_LPAREN;
 737
 738         case ')':
 739           return T_RPAREN;
 740
 741         case '[':
 742           return T_LBRACK;
 743
 744         case ']':
 745           return T_RBRACK;
 746
 747         case ',':
 748           return T_COMMA;
 749
 750         case '*':
 751           return lexer->prog[1] == '*' ? T_EXP : T_ASTERISK;
 752
 753         case '<':
 754           return (lexer->prog[1] == '=' ? T_LE
 755                   : lexer->prog[1] == '>' ? T_NE
 756                   : T_LT);
 757
 758         case '>':
 759           return lexer->prog[1] == '=' ? T_GE : T_GT;
 760
 761         case '~':
 762           return lexer->prog[1] == '=' ? T_NE : T_NOT;
 763
 764         case '&':
 765           return T_AND;
 766
 767         case '|':
 768           return T_OR;
 769
 770         default:
 771           if (lex_is_id1 (*lexer->prog))
 772             return T_ID;
 773           return 0;
 774         }
 775     }
 776 }
 777
 778 /* Makes the current token become the next token to be read; the
 779    current token is set to T. */
 780 void
 781 lex_put_back (struct lexer *lexer, enum token_type t)
 782 {
 783   save_token (lexer);
 784   lexer->token = t;
 785 }
 786 \f
 787 /* Weird line processing functions. */
 788
 789 /* Returns the entire contents of the current line. */
 790 const char *
 791 lex_entire_line (const struct lexer *lexer)
 792 {
 793   return ds_cstr (&lexer->line_buffer);
 794 }
 795
 796 const struct string *
 797 lex_entire_line_ds (const struct lexer *lexer)
 798 {
 799   return &lexer->line_buffer;
 800 }
 801
 802 /* As lex_entire_line(), but only returns the part of the current line
 803    that hasn't already been tokenized. */
 804 const char *
 805 lex_rest_of_line (const struct lexer *lexer)
 806 {
 807   return lexer->prog;
 808 }
 809
 810 /* Returns true if the current line ends in a terminal dot,
 811    false otherwise. */
 812 bool
 813 lex_end_dot (const struct lexer *lexer)
 814 {
 815   return lexer->dot;
 816 }
 817
 818 /* Causes the rest of the current input line to be ignored for
 819    tokenization purposes. */
 820 void
 821 lex_discard_line (struct lexer *lexer)
 822 {
 823   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 824   lexer->prog = ds_end (&lexer->line_buffer);
 825   lexer->dot = false;
 826   lexer->put_token = 0;
 827 }
 828
 829
 830 /* Discards the rest of the current command.
 831    When we're reading commands from a file, we skip tokens until
 832    a terminal dot or EOF.
 833    When we're reading commands interactively from the user,
 834    that's just discarding the current line, because presumably
 835    the user doesn't want to finish typing a command that will be
 836    ignored anyway. */
 837 void
 838 lex_discard_rest_of_command (struct lexer *lexer)
 839 {
 840   if (!getl_is_interactive (lexer->ss))
 841     {
 842       while (lexer->token != T_STOP && lexer->token != T_ENDCMD)
 843         lex_get (lexer);
 844     }
 845   else
 846     lex_discard_line (lexer);
 847 }
 848 \f
 849 /* Weird line reading functions. */
 850
 851 /* Remove C-style comments in STRING, begun by slash-star and
 852    terminated by star-slash or newline. */
 853 static void
 854 strip_comments (struct string *string)
 855 {
 856   char *cp;
 857   int quote;
 858   bool in_comment;
 859
 860   in_comment = false;
 861   quote = EOF;
 862   for (cp = ds_cstr (string); *cp; )
 863     {
 864       /* If we're not in a comment, check for quote marks. */
 865       if (!in_comment)
 866         {
 867           if (*cp == quote)
 868             quote = EOF;
 869           else if (*cp == '\'' || *cp == '"')
 870             quote = *cp;
 871         }
 872
 873       /* If we're not inside a quotation, check for comment. */
 874       if (quote == EOF)
 875         {
 876           if (cp[0] == '/' && cp[1] == '*')
 877             {
 878               in_comment = true;
 879               *cp++ = ' ';
 880               *cp++ = ' ';
 881               continue;
 882             }
 883           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 884             {
 885               in_comment = false;
 886               *cp++ = ' ';
 887               *cp++ = ' ';
 888               continue;
 889             }
 890         }
 891
 892       /* Check commenting. */
 893       if (in_comment)
 894         *cp = ' ';
 895       cp++;
 896     }
 897 }
 898
 899 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 900    tokenization by stripping comments and determining whether it
 901    is the beginning or end of a command and storing into
 902    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 903 void
 904 lex_preprocess_line (struct string *line,
 905                      enum syntax_mode syntax,
 906                      bool *line_starts_command,
 907                      bool *line_ends_command)
 908 {
 909   strip_comments (line);
 910   ds_rtrim (line, ss_cstr (CC_SPACES));
 911   *line_ends_command = ds_chomp_byte (line, '.') || ds_is_empty (line);
 912   *line_starts_command = false;
 913   if (syntax == GETL_BATCH)
 914     {
 915       int first = ds_first (line);
 916       *line_starts_command = !c_isspace (first);
 917       if (first == '+' || first == '-')
 918         *ds_data (line) = ' ';
 919     }
 920 }
 921
 922 /* Reads a line, without performing any preprocessing. */
 923 bool
 924 lex_get_line_raw (struct lexer *lexer)
 925 {
 926   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 927   if (ok)
 928     {
 929       const char *line = ds_cstr (&lexer->line_buffer);
 930       text_item_submit (text_item_create (TEXT_ITEM_SYNTAX, line));
 931     }
 932   else
 933     lexer->prog = NULL;
 934   return ok;
 935 }
 936
 937 /* Reads a line for use by the tokenizer, and preprocesses it by
 938    removing comments, stripping trailing whitespace and the
 939    terminal dot, and removing leading indentors. */
 940 bool
 941 lex_get_line (struct lexer *lexer)
 942 {
 943   bool line_starts_command;
 944
 945   if (!lex_get_line_raw (lexer))
 946     return false;
 947
 948   lex_preprocess_line (&lexer->line_buffer,
 949                        lex_current_syntax_mode (lexer),
 950                        &line_starts_command, &lexer->dot);
 951
 952   if (line_starts_command)
 953     lexer->put_token = T_ENDCMD;
 954
 955   lexer->prog = ds_cstr (&lexer->line_buffer);
 956   return true;
 957 }
 958 \f
 959 /* Token names. */
 960
 961 /* Returns the name of a token. */
 962 const char *
 963 lex_token_name (enum token_type token)
 964 {
 965   switch (token)
 966     {
 967     case T_ID:
 968     case T_POS_NUM:
 969     case T_NEG_NUM:
 970     case T_STRING:
 971     case TOKEN_N_TYPES:
 972       NOT_REACHED ();
 973
 974     case T_STOP:
 975       return "";
 976
 977     case T_ENDCMD:
 978       return ".";
 979
 980     case T_PLUS:
 981       return "+";
 982
 983     case T_DASH:
 984       return "-";
 985
 986     case T_ASTERISK:
 987       return "*";
 988
 989     case T_SLASH:
 990       return "/";
 991
 992     case T_EQUALS:
 993       return "=";
 994
 995     case T_LPAREN:
 996       return "(";
 997
 998     case T_RPAREN:
 999       return ")";
1000
1001     case T_LBRACK:
1002       return "[";
1003
1004     case T_RBRACK:
1005       return "]";
1006
1007     case T_COMMA:
1008       return ",";
1009
1010     case T_AND:
1011       return "AND";
1012
1013     case T_OR:
1014       return "OR";
1015
1016     case T_NOT:
1017       return "NOT";
1018
1019     case T_EQ:
1020       return "EQ";
1021
1022     case T_GE:
1023       return ">=";
1024
1025     case T_GT:
1026       return ">";
1027
1028     case T_LE:
1029       return "<=";
1030
1031     case T_LT:
1032       return "<";
1033
1034     case T_NE:
1035       return "~=";
1036
1037     case T_ALL:
1038       return "ALL";
1039
1040     case T_BY:
1041       return "BY";
1042
1043     case T_TO:
1044       return "TO";
1045
1046     case T_WITH:
1047       return "WITH";
1048
1049     case T_EXP:
1050       return "**";
1051     }
1052
1053   NOT_REACHED ();
1054 }
1055
1056 /* Returns an ASCII representation of the current token as a
1057    malloc()'d string. */
1058 char *
1059 lex_token_representation (struct lexer *lexer)
1060 {
1061   char *token_rep;
1062
1063   switch (lexer->token)
1064     {
1065     case T_ID:
1066     case T_POS_NUM:
1067     case T_NEG_NUM:
1068       return ss_xstrdup (lex_tokss (lexer));
1069
1070     case T_STRING:
1071       {
1072         struct substring ss;
1073         int hexstring = 0;
1074         char *sp, *dp;
1075
1076         ss = lex_tokss (lexer);
1077         for (sp = ss_data (ss); sp < ss_end (ss); sp++)
1078           if (!c_isprint ((unsigned char) *sp))
1079             {
1080               hexstring = 1;
1081               break;
1082             }
1083
1084         token_rep = xmalloc (2 + ss_length (ss) * 2 + 1 + 1);
1085
1086         dp = token_rep;
1087         if (hexstring)
1088           *dp++ = 'X';
1089         *dp++ = '\'';
1090
1091         for (sp = ss_data (ss); sp < ss_end (ss); sp++)
1092           if (!hexstring)
1093             {
1094               if (*sp == '\'')
1095                 *dp++ = '\'';
1096               *dp++ = (unsigned char) *sp;
1097             }
1098           else
1099             {
1100               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
1101               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
1102             }
1103         *dp++ = '\'';
1104         *dp = '\0';
1105
1106         return token_rep;
1107       }
1108
1109     default:
1110       return xstrdup (lex_token_name (lexer->token));
1111     }
1112 }
1113 \f
1114 /* Really weird functions. */
1115
1116 /* Skip a COMMENT command. */
1117 void
1118 lex_skip_comment (struct lexer *lexer)
1119 {
1120   for (;;)
1121     {
1122       if (!lex_get_line (lexer))
1123         {
1124           lexer->put_token = T_STOP;
1125           lexer->prog = NULL;
1126           return;
1127         }
1128
1129       if (lexer->put_token == T_ENDCMD)
1130         break;
1131
1132       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1133       lexer->prog = ds_end (&lexer->line_buffer);
1134       if (lexer->dot)
1135         break;
1136     }
1137 }
1138 \f
1139 /* Private functions. */
1140
1141 /* When invoked, tokstr contains a string of binary, octal, or
1142    hex digits, according to TYPE.  The string is converted to
1143    characters having the specified values. */
1144 static void
1145 convert_numeric_string_to_char_string (struct lexer *lexer,
1146                                        enum string_type type)
1147 {
1148   const char *base_name;
1149   int base;
1150   int chars_per_byte;
1151   size_t byte_cnt;
1152   size_t i;
1153   char *p;
1154
1155   switch (type)
1156     {
1157     case BINARY_STRING:
1158       base_name = _("binary");
1159       base = 2;
1160       chars_per_byte = 8;
1161       break;
1162     case OCTAL_STRING:
1163       base_name = _("octal");
1164       base = 8;
1165       chars_per_byte = 3;
1166       break;
1167     case HEX_STRING:
1168       base_name = _("hex");
1169       base = 16;
1170       chars_per_byte = 2;
1171       break;
1172     default:
1173       NOT_REACHED ();
1174     }
1175
1176   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1177   if (ds_length (&lexer->tokstr) % chars_per_byte)
1178     msg (SE, _("String of %s digits has %zu characters, which is not a "
1179                "multiple of %d."),
1180          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1181
1182   p = ds_cstr (&lexer->tokstr);
1183   for (i = 0; i < byte_cnt; i++)
1184     {
1185       int value;
1186       int j;
1187
1188       value = 0;
1189       for (j = 0; j < chars_per_byte; j++, p++)
1190         {
1191           int v;
1192
1193           if (*p >= '0' && *p <= '9')
1194             v = *p - '0';
1195           else
1196             {
1197               static const char alpha[] = "abcdef";
1198               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1199
1200               if (q)
1201                 v = q - alpha + 10;
1202               else
1203                 v = base;
1204             }
1205
1206           if (v >= base)
1207             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1208
1209           value = value * base + v;
1210         }
1211
1212       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1213     }
1214
1215   ds_truncate (&lexer->tokstr, byte_cnt);
1216 }
1217
1218 /* Parses a string from the input buffer into tokstr.  The input
1219    buffer pointer lexer->prog must point to the initial single or double
1220    quote.  TYPE indicates the type of string to be parsed.
1221    Returns token type. */
1222 static int
1223 parse_string (struct lexer *lexer, enum string_type type)
1224 {
1225   if (type != CHARACTER_STRING)
1226     lexer->prog++;
1227
1228   /* Accumulate the entire string, joining sections indicated by +
1229      signs. */
1230   for (;;)
1231     {
1232       /* Single or double quote. */
1233       int c = *lexer->prog++;
1234
1235       /* Accumulate section. */
1236       for (;;)
1237         {
1238           /* Check end of line. */
1239           if (*lexer->prog == '\0')
1240             {
1241               msg (SE, _("Unterminated string constant."));
1242               goto finish;
1243             }
1244
1245           /* Double quote characters to embed them in strings. */
1246           if (*lexer->prog == c)
1247             {
1248               if (lexer->prog[1] == c)
1249                 lexer->prog++;
1250               else
1251                 break;
1252             }
1253
1254           ds_put_byte (&lexer->tokstr, *lexer->prog++);
1255         }
1256       lexer->prog++;
1257
1258       /* Skip whitespace after final quote mark. */
1259       if (lexer->prog == NULL)
1260         break;
1261       for (;;)
1262         {
1263           while (c_isspace ((unsigned char) *lexer->prog))
1264             lexer->prog++;
1265           if (*lexer->prog)
1266             break;
1267
1268           if (lexer->dot)
1269             goto finish;
1270
1271           if (!lex_get_line (lexer))
1272             goto finish;
1273         }
1274
1275       /* Skip plus sign. */
1276       if (*lexer->prog != '+')
1277         break;
1278       lexer->prog++;
1279
1280       /* Skip whitespace after plus sign. */
1281       if (lexer->prog == NULL)
1282         break;
1283       for (;;)
1284         {
1285           while (c_isspace ((unsigned char) *lexer->prog))
1286             lexer->prog++;
1287           if (*lexer->prog)
1288             break;
1289
1290           if (lexer->dot)
1291             goto finish;
1292
1293           if (!lex_get_line (lexer))
1294             {
1295               msg (SE, _("Unexpected end of file in string concatenation."));
1296               goto finish;
1297             }
1298         }
1299
1300       /* Ensure that a valid string follows. */
1301       if (*lexer->prog != '\'' && *lexer->prog != '"')
1302         {
1303           msg (SE, _("String expected following `+'."));
1304           goto finish;
1305         }
1306     }
1307
1308   /* We come here when we've finished concatenating all the string sections
1309      into one large string. */
1310 finish:
1311   if (type != CHARACTER_STRING)
1312     convert_numeric_string_to_char_string (lexer, type);
1313
1314   return T_STRING;
1315 }
1316 \f
1317 /* Token Accessor Functions */
1318
1319 enum token_type
1320 lex_token (const struct lexer *lexer)
1321 {
1322   return lexer->token;
1323 }
1324
1325 double
1326 lex_tokval (const struct lexer *lexer)
1327 {
1328   return lexer->tokval;
1329 }
1330
1331 /* Returns the null-terminated string value associated with LEXER's current
1332    token.  For a T_ID token, this is the identifier, and for a T_STRING token,
1333    this is the string.  For other tokens the value is undefined. */
1334 const char *
1335 lex_tokcstr (const struct lexer *lexer)
1336 {
1337   return ds_cstr (&lexer->tokstr);
1338 }
1339
1340 /* Returns the string value associated with LEXER's current token.  For a T_ID
1341    token, this is the identifier, and for a T_STRING token, this is the string.
1342    For other tokens the value is undefined. */
1343 struct substring
1344 lex_tokss (const struct lexer *lexer)
1345 {
1346   return ds_ss (&lexer->tokstr);
1347 }
1348
1349 /* If the lexer is positioned at the (pseudo)identifier S, which
1350    may contain a hyphen ('-'), skips it and returns true.  Each
1351    half of the identifier may be abbreviated to its first three
1352    letters.
1353    Otherwise, returns false. */
1354 bool
1355 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1356 {
1357   const char *hyphen = strchr (s, '-');
1358   if (hyphen == NULL)
1359     return lex_match_id (lexer, s);
1360   else if (lexer->token != T_ID
1361            || !lex_id_match (ss_buffer (s, hyphen - s), lex_tokss (lexer))
1362            || lex_look_ahead (lexer) != T_DASH)
1363     return false;
1364   else
1365     {
1366       lex_get (lexer);
1367       lex_force_match (lexer, T_DASH);
1368       lex_force_match_id (lexer, hyphen + 1);
1369       return true;
1370     }
1371 }
1372