pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "lexer.h"
  20 #include <c-ctype.h>
  21 #include <c-strtod.h>
  22 #include <errno.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <language/command.h>
  29 #include <data/settings.h>
  30 #include <libpspp/assertion.h>
  31 #include <libpspp/getl.h>
  32 #include <libpspp/message.h>
  33 #include <libpspp/str.h>
  34 #include <output/journal.h>
  35 #include <output/text-item.h>
  36
  37 #include "xalloc.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43 struct lexer
  44 {
  45   struct string line_buffer;
  46
  47   struct source_stream *ss;
  48
  49   int token;      /* Current token. */
  50   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  51
  52   struct string tokstr;   /* T_ID, T_STRING: token string value. */
  53
  54   char *prog; /* Pointer to next token in line_buffer. */
  55   bool dot;   /* True only if this line ends with a terminal dot. */
  56
  57   int put_token ; /* If nonzero, next token returned by lex_get().
  58                     Used only in exceptional circumstances. */
  59
  60   struct string put_tokstr;
  61   double put_tokval;
  62 };
  63
  64
  65 static int parse_id (struct lexer *);
  66
  67 /* How a string represents its contents. */
  68 enum string_type
  69   {
  70     CHARACTER_STRING,   /* Characters. */
  71     BINARY_STRING,      /* Binary digits. */
  72     OCTAL_STRING,       /* Octal digits. */
  73     HEX_STRING          /* Hexadecimal digits. */
  74   };
  75
  76 static int parse_string (struct lexer *, enum string_type);
  77 \f
  78 /* Initialization. */
  79
  80 /* Initializes the lexer. */
  81 struct lexer *
  82 lex_create (struct source_stream *ss)
  83 {
  84   struct lexer *lexer = xzalloc (sizeof (*lexer));
  85
  86   ds_init_empty (&lexer->tokstr);
  87   ds_init_empty (&lexer->put_tokstr);
  88   ds_init_empty (&lexer->line_buffer);
  89   lexer->ss = ss;
  90
  91   return lexer;
  92 }
  93
  94 struct source_stream *
  95 lex_get_source_stream (const struct lexer *lex)
  96 {
  97   return lex->ss;
  98 }
  99
 100 enum syntax_mode
 101 lex_current_syntax_mode (const struct lexer *lex)
 102 {
 103   return source_stream_current_syntax_mode (lex->ss);
 104 }
 105
 106 enum error_mode
 107 lex_current_error_mode (const struct lexer *lex)
 108 {
 109   return source_stream_current_error_mode (lex->ss);
 110 }
 111
 112
 113 void
 114 lex_destroy (struct lexer *lexer)
 115 {
 116   if ( NULL != lexer )
 117     {
 118       ds_destroy (&lexer->put_tokstr);
 119       ds_destroy (&lexer->tokstr);
 120       ds_destroy (&lexer->line_buffer);
 121
 122       free (lexer);
 123     }
 124 }
 125
 126 \f
 127 /* Common functions. */
 128
 129 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 130    tokval, respectively, and sets tokid appropriately. */
 131 static void
 132 restore_token (struct lexer *lexer)
 133 {
 134   assert (lexer->put_token != 0);
 135   lexer->token = lexer->put_token;
 136   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 137   lexer->tokval = lexer->put_tokval;
 138   lexer->put_token = 0;
 139 }
 140
 141 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 142    put_lexer->tokval respectively. */
 143 static void
 144 save_token (struct lexer *lexer)
 145 {
 146   lexer->put_token = lexer->token;
 147   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 148   lexer->put_tokval = lexer->tokval;
 149 }
 150
 151 /* Parses a single token, setting appropriate global variables to
 152    indicate the token's attributes. */
 153 void
 154 lex_get (struct lexer *lexer)
 155 {
 156   /* Find a token. */
 157   for (;;)
 158     {
 159       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 160         {
 161           lexer->token = T_STOP;
 162           return;
 163         }
 164
 165       /* If a token was pushed ahead, return it. */
 166       if (lexer->put_token)
 167         {
 168           restore_token (lexer);
 169           return;
 170         }
 171
 172       for (;;)
 173         {
 174           /* Skip whitespace. */
 175           while (c_isspace ((unsigned char) *lexer->prog))
 176             lexer->prog++;
 177
 178           if (*lexer->prog)
 179             break;
 180
 181           if (lexer->dot)
 182             {
 183               lexer->dot = 0;
 184               lexer->token = T_ENDCMD;
 185               return;
 186             }
 187           else if (!lex_get_line (lexer))
 188             {
 189               lexer->prog = NULL;
 190               lexer->token = T_STOP;
 191               return;
 192             }
 193
 194           if (lexer->put_token)
 195             {
 196               restore_token (lexer);
 197               return;
 198             }
 199         }
 200
 201
 202       /* Actually parse the token. */
 203       ds_clear (&lexer->tokstr);
 204
 205       switch (*lexer->prog)
 206         {
 207         case '-': case '.':
 208         case '0': case '1': case '2': case '3': case '4':
 209         case '5': case '6': case '7': case '8': case '9':
 210           {
 211             char *tail;
 212
 213             /* `-' can introduce a negative number, or it can be a token by
 214                itself. */
 215             if (*lexer->prog == '-')
 216               {
 217                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 218                 while (c_isspace ((unsigned char) *lexer->prog))
 219                   lexer->prog++;
 220
 221                 if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 222                   {
 223                     lexer->token = T_DASH;
 224                     break;
 225                   }
 226                 lexer->token = T_NEG_NUM;
 227               }
 228             else
 229               lexer->token = T_POS_NUM;
 230
 231             /* Parse the number, copying it into tokstr. */
 232             while (c_isdigit ((unsigned char) *lexer->prog))
 233               ds_put_byte (&lexer->tokstr, *lexer->prog++);
 234             if (*lexer->prog == '.')
 235               {
 236                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 237                 while (c_isdigit ((unsigned char) *lexer->prog))
 238                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 239               }
 240             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 241               {
 242                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 243                 if (*lexer->prog == '+' || *lexer->prog == '-')
 244                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 245                 while (c_isdigit ((unsigned char) *lexer->prog))
 246                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 247               }
 248
 249             /* Parse as floating point. */
 250             lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail);
 251             if (*tail)
 252               {
 253                 msg (SE, _("%s does not form a valid number."),
 254                      ds_cstr (&lexer->tokstr));
 255                 lexer->tokval = 0.0;
 256
 257                 ds_clear (&lexer->tokstr);
 258                 ds_put_byte (&lexer->tokstr, '0');
 259               }
 260
 261             break;
 262           }
 263
 264         case '\'': case '"':
 265           lexer->token = parse_string (lexer, CHARACTER_STRING);
 266           break;
 267
 268         case '+':
 269           lexer->token = T_PLUS;
 270           lexer->prog++;
 271           break;
 272
 273         case '/':
 274           lexer->token = T_SLASH;
 275           lexer->prog++;
 276           break;
 277
 278         case '=':
 279           lexer->token = T_EQUALS;
 280           lexer->prog++;
 281           break;
 282
 283         case '(':
 284           lexer->token = T_LPAREN;
 285           lexer->prog++;
 286           break;
 287
 288         case ')':
 289           lexer->token = T_RPAREN;
 290           lexer->prog++;
 291           break;
 292
 293         case '[':
 294           lexer->token = T_LBRACK;
 295           lexer->prog++;
 296           break;
 297
 298         case ']':
 299           lexer->token = T_RBRACK;
 300           lexer->prog++;
 301           break;
 302
 303         case ',':
 304           lexer->token = T_COMMA;
 305           lexer->prog++;
 306           break;
 307
 308         case '*':
 309           if (*++lexer->prog == '*')
 310             {
 311               lexer->prog++;
 312               lexer->token = T_EXP;
 313             }
 314           else
 315             lexer->token = T_ASTERISK;
 316           break;
 317
 318         case '<':
 319           if (*++lexer->prog == '=')
 320             {
 321               lexer->prog++;
 322               lexer->token = T_LE;
 323             }
 324           else if (*lexer->prog == '>')
 325             {
 326               lexer->prog++;
 327               lexer->token = T_NE;
 328             }
 329           else
 330             lexer->token = T_LT;
 331           break;
 332
 333         case '>':
 334           if (*++lexer->prog == '=')
 335             {
 336               lexer->prog++;
 337               lexer->token = T_GE;
 338             }
 339           else
 340             lexer->token = T_GT;
 341           break;
 342
 343         case '~':
 344           if (*++lexer->prog == '=')
 345             {
 346               lexer->prog++;
 347               lexer->token = T_NE;
 348             }
 349           else
 350             lexer->token = T_NOT;
 351           break;
 352
 353         case '&':
 354           lexer->prog++;
 355           lexer->token = T_AND;
 356           break;
 357
 358         case '|':
 359           lexer->prog++;
 360           lexer->token = T_OR;
 361           break;
 362
 363         case 'b': case 'B':
 364           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 365             lexer->token = parse_string (lexer, BINARY_STRING);
 366           else
 367             lexer->token = parse_id (lexer);
 368           break;
 369
 370         case 'o': case 'O':
 371           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 372             lexer->token = parse_string (lexer, OCTAL_STRING);
 373           else
 374             lexer->token = parse_id (lexer);
 375           break;
 376
 377         case 'x': case 'X':
 378           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 379             lexer->token = parse_string (lexer, HEX_STRING);
 380           else
 381             lexer->token = parse_id (lexer);
 382           break;
 383
 384         default:
 385           if (lex_is_id1 (*lexer->prog))
 386             {
 387               lexer->token = parse_id (lexer);
 388               break;
 389             }
 390           else
 391             {
 392               unsigned char c = *lexer->prog++;
 393               char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c);
 394               msg (SE, _("Bad character in input: `%s'."), c_name);
 395               free (c_name);
 396               continue;
 397             }
 398         }
 399       break;
 400     }
 401 }
 402
 403 /* Parses an identifier at the current position into tokstr.
 404    Returns the correct token type. */
 405 static int
 406 parse_id (struct lexer *lexer)
 407 {
 408   struct substring rest_of_line
 409     = ss_substr (ds_ss (&lexer->line_buffer),
 410                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 411                  SIZE_MAX);
 412   struct substring id = ss_head (rest_of_line,
 413                                  lex_id_get_length (rest_of_line));
 414   lexer->prog += ss_length (id);
 415
 416   ds_assign_substring (&lexer->tokstr, id);
 417   return lex_id_to_token (id);
 418 }
 419
 420 /* Reports an error to the effect that subcommand SBC may only be
 421    specified once. */
 422 void
 423 lex_sbc_only_once (const char *sbc)
 424 {
 425   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 426 }
 427
 428 /* Reports an error to the effect that subcommand SBC is
 429    missing. */
 430 void
 431 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 432 {
 433   lex_error (lexer, _("missing required subcommand %s"), sbc);
 434 }
 435
 436 /* Prints a syntax error message containing the current token and
 437    given message MESSAGE (if non-null). */
 438 void
 439 lex_error (struct lexer *lexer, const char *message, ...)
 440 {
 441   struct string s;
 442
 443   ds_init_empty (&s);
 444
 445   if (lexer->token == T_STOP)
 446     ds_put_cstr (&s, _("Syntax error at end of file"));
 447   else if (lexer->token == T_ENDCMD)
 448     ds_put_cstr (&s, _("Syntax error at end of command"));
 449   else
 450     {
 451       char *token_rep = lex_token_representation (lexer);
 452       ds_put_format (&s, _("Syntax error at `%s'"), token_rep);
 453       free (token_rep);
 454     }
 455
 456   if (message)
 457     {
 458       va_list args;
 459
 460       ds_put_cstr (&s, ": ");
 461
 462       va_start (args, message);
 463       ds_put_vformat (&s, message, args);
 464       va_end (args);
 465     }
 466
 467   msg (SE, "%s.", ds_cstr (&s));
 468   ds_destroy (&s);
 469 }
 470
 471 /* Checks that we're at end of command.
 472    If so, returns a successful command completion code.
 473    If not, flags a syntax error and returns an error command
 474    completion code. */
 475 int
 476 lex_end_of_command (struct lexer *lexer)
 477 {
 478   if (lexer->token != T_ENDCMD)
 479     {
 480       lex_error (lexer, _("expecting end of command"));
 481       return CMD_FAILURE;
 482     }
 483   else
 484     return CMD_SUCCESS;
 485 }
 486 \f
 487 /* Token testing functions. */
 488
 489 /* Returns true if the current token is a number. */
 490 bool
 491 lex_is_number (struct lexer *lexer)
 492 {
 493   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 494 }
 495
 496
 497 /* Returns true if the current token is a string. */
 498 bool
 499 lex_is_string (struct lexer *lexer)
 500 {
 501   return lexer->token == T_STRING;
 502 }
 503
 504
 505 /* Returns the value of the current token, which must be a
 506    floating point number. */
 507 double
 508 lex_number (struct lexer *lexer)
 509 {
 510   assert (lex_is_number (lexer));
 511   return lexer->tokval;
 512 }
 513
 514 /* Returns true iff the current token is an integer. */
 515 bool
 516 lex_is_integer (struct lexer *lexer)
 517 {
 518   return (lex_is_number (lexer)
 519           && lexer->tokval > LONG_MIN
 520           && lexer->tokval <= LONG_MAX
 521           && floor (lexer->tokval) == lexer->tokval);
 522 }
 523
 524 /* Returns the value of the current token, which must be an
 525    integer. */
 526 long
 527 lex_integer (struct lexer *lexer)
 528 {
 529   assert (lex_is_integer (lexer));
 530   return lexer->tokval;
 531 }
 532 \f
 533 /* Token matching functions. */
 534
 535 /* If TOK is the current token, skips it and returns true
 536    Otherwise, returns false. */
 537 bool
 538 lex_match (struct lexer *lexer, enum token_type t)
 539 {
 540   if (lexer->token == t)
 541     {
 542       lex_get (lexer);
 543       return true;
 544     }
 545   else
 546     return false;
 547 }
 548
 549 /* If the current token is the identifier S, skips it and returns
 550    true.  The identifier may be abbreviated to its first three
 551    letters.
 552    Otherwise, returns false. */
 553 bool
 554 lex_match_id (struct lexer *lexer, const char *s)
 555 {
 556   return lex_match_id_n (lexer, s, 3);
 557 }
 558
 559 /* If the current token is the identifier S, skips it and returns
 560    true.  The identifier may be abbreviated to its first N
 561    letters.
 562    Otherwise, returns false. */
 563 bool
 564 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 565 {
 566   if (lexer->token == T_ID
 567       && lex_id_match_n (ss_cstr (s), lex_tokss (lexer), n))
 568     {
 569       lex_get (lexer);
 570       return true;
 571     }
 572   else
 573     return false;
 574 }
 575
 576 /* If the current token is integer N, skips it and returns true.
 577    Otherwise, returns false. */
 578 bool
 579 lex_match_int (struct lexer *lexer, int x)
 580 {
 581   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 582     {
 583       lex_get (lexer);
 584       return true;
 585     }
 586   else
 587     return false;
 588 }
 589 \f
 590 /* Forced matches. */
 591
 592 /* If this token is identifier S, fetches the next token and returns
 593    nonzero.
 594    Otherwise, reports an error and returns zero. */
 595 bool
 596 lex_force_match_id (struct lexer *lexer, const char *s)
 597 {
 598   if (lex_match_id (lexer, s))
 599     return true;
 600   else
 601     {
 602       lex_error (lexer, _("expecting `%s'"), s);
 603       return false;
 604     }
 605 }
 606
 607 /* If the current token is T, skips the token.  Otherwise, reports an
 608    error and returns from the current function with return value false. */
 609 bool
 610 lex_force_match (struct lexer *lexer, enum token_type t)
 611 {
 612   if (lexer->token == t)
 613     {
 614       lex_get (lexer);
 615       return true;
 616     }
 617   else
 618     {
 619       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 620       return false;
 621     }
 622 }
 623
 624 /* If this token is a string, does nothing and returns true.
 625    Otherwise, reports an error and returns false. */
 626 bool
 627 lex_force_string (struct lexer *lexer)
 628 {
 629   if (lex_is_string (lexer))
 630     return true;
 631   else
 632     {
 633       lex_error (lexer, _("expecting string"));
 634       return false;
 635     }
 636 }
 637
 638 /* If this token is an integer, does nothing and returns true.
 639    Otherwise, reports an error and returns false. */
 640 bool
 641 lex_force_int (struct lexer *lexer)
 642 {
 643   if (lex_is_integer (lexer))
 644     return true;
 645   else
 646     {
 647       lex_error (lexer, _("expecting integer"));
 648       return false;
 649     }
 650 }
 651
 652 /* If this token is a number, does nothing and returns true.
 653    Otherwise, reports an error and returns false. */
 654 bool
 655 lex_force_num (struct lexer *lexer)
 656 {
 657   if (lex_is_number (lexer))
 658     return true;
 659
 660   lex_error (lexer, _("expecting number"));
 661   return false;
 662 }
 663
 664 /* If this token is an identifier, does nothing and returns true.
 665    Otherwise, reports an error and returns false. */
 666 bool
 667 lex_force_id (struct lexer *lexer)
 668 {
 669   if (lexer->token == T_ID)
 670     return true;
 671
 672   lex_error (lexer, _("expecting identifier"));
 673   return false;
 674 }
 675
 676 /* Weird token functions. */
 677
 678 /* Returns the likely type of the next token, or 0 if it's hard to tell. */
 679 enum token_type
 680 lex_look_ahead (struct lexer *lexer)
 681 {
 682   if (lexer->put_token)
 683     return lexer->put_token;
 684
 685   for (;;)
 686     {
 687       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 688         return 0;
 689
 690       for (;;)
 691         {
 692           while (c_isspace ((unsigned char) *lexer->prog))
 693             lexer->prog++;
 694           if (*lexer->prog)
 695             break;
 696
 697           if (lexer->dot)
 698             return T_ENDCMD;
 699           else if (!lex_get_line (lexer))
 700             return 0;
 701
 702           if (lexer->put_token)
 703             return lexer->put_token;
 704         }
 705
 706       switch (toupper ((unsigned char) *lexer->prog))
 707         {
 708         case 'X': case 'B': case 'O':
 709           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 710             return T_STRING;
 711           /* Fall through */
 712
 713         case '-':
 714           return T_DASH;
 715
 716         case '.':
 717         case '0': case '1': case '2': case '3': case '4':
 718         case '5': case '6': case '7': case '8': case '9':
 719           return T_POS_NUM;
 720
 721         case '\'': case '"':
 722           return T_STRING;
 723
 724         case '+':
 725           return T_PLUS;
 726
 727         case '/':
 728           return T_SLASH;
 729
 730         case '=':
 731           return T_EQUALS;
 732
 733         case '(':
 734           return T_LPAREN;
 735
 736         case ')':
 737           return T_RPAREN;
 738
 739         case '[':
 740           return T_LBRACK;
 741
 742         case ']':
 743           return T_RBRACK;
 744
 745         case ',':
 746           return T_COMMA;
 747
 748         case '*':
 749           return lexer->prog[1] == '*' ? T_EXP : T_ASTERISK;
 750
 751         case '<':
 752           return (lexer->prog[1] == '=' ? T_LE
 753                   : lexer->prog[1] == '>' ? T_NE
 754                   : T_LT);
 755
 756         case '>':
 757           return lexer->prog[1] == '=' ? T_GE : T_GT;
 758
 759         case '~':
 760           return lexer->prog[1] == '=' ? T_NE : T_NOT;
 761
 762         case '&':
 763           return T_AND;
 764
 765         case '|':
 766           return T_OR;
 767
 768         default:
 769           if (lex_is_id1 (*lexer->prog))
 770             return T_ID;
 771           return 0;
 772         }
 773     }
 774 }
 775
 776 /* Makes the current token become the next token to be read; the
 777    current token is set to T. */
 778 void
 779 lex_put_back (struct lexer *lexer, enum token_type t)
 780 {
 781   save_token (lexer);
 782   lexer->token = t;
 783 }
 784
 785 /* Makes the current token become the next token to be read; the
 786    current token is set to the identifier ID. */
 787 void
 788 lex_put_back_id (struct lexer *lexer, const char *id)
 789 {
 790   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 791   save_token (lexer);
 792   lexer->token = T_ID;
 793   ds_assign_cstr (&lexer->tokstr, id);
 794 }
 795 \f
 796 /* Weird line processing functions. */
 797
 798 /* Returns the entire contents of the current line. */
 799 const char *
 800 lex_entire_line (const struct lexer *lexer)
 801 {
 802   return ds_cstr (&lexer->line_buffer);
 803 }
 804
 805 const struct string *
 806 lex_entire_line_ds (const struct lexer *lexer)
 807 {
 808   return &lexer->line_buffer;
 809 }
 810
 811 /* As lex_entire_line(), but only returns the part of the current line
 812    that hasn't already been tokenized. */
 813 const char *
 814 lex_rest_of_line (const struct lexer *lexer)
 815 {
 816   return lexer->prog;
 817 }
 818
 819 /* Returns true if the current line ends in a terminal dot,
 820    false otherwise. */
 821 bool
 822 lex_end_dot (const struct lexer *lexer)
 823 {
 824   return lexer->dot;
 825 }
 826
 827 /* Causes the rest of the current input line to be ignored for
 828    tokenization purposes. */
 829 void
 830 lex_discard_line (struct lexer *lexer)
 831 {
 832   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 833   lexer->prog = ds_end (&lexer->line_buffer);
 834   lexer->dot = false;
 835   lexer->put_token = 0;
 836 }
 837
 838
 839 /* Discards the rest of the current command.
 840    When we're reading commands from a file, we skip tokens until
 841    a terminal dot or EOF.
 842    When we're reading commands interactively from the user,
 843    that's just discarding the current line, because presumably
 844    the user doesn't want to finish typing a command that will be
 845    ignored anyway. */
 846 void
 847 lex_discard_rest_of_command (struct lexer *lexer)
 848 {
 849   if (!getl_is_interactive (lexer->ss))
 850     {
 851       while (lexer->token != T_STOP && lexer->token != T_ENDCMD)
 852         lex_get (lexer);
 853     }
 854   else
 855     lex_discard_line (lexer);
 856 }
 857 \f
 858 /* Weird line reading functions. */
 859
 860 /* Remove C-style comments in STRING, begun by slash-star and
 861    terminated by star-slash or newline. */
 862 static void
 863 strip_comments (struct string *string)
 864 {
 865   char *cp;
 866   int quote;
 867   bool in_comment;
 868
 869   in_comment = false;
 870   quote = EOF;
 871   for (cp = ds_cstr (string); *cp; )
 872     {
 873       /* If we're not in a comment, check for quote marks. */
 874       if (!in_comment)
 875         {
 876           if (*cp == quote)
 877             quote = EOF;
 878           else if (*cp == '\'' || *cp == '"')
 879             quote = *cp;
 880         }
 881
 882       /* If we're not inside a quotation, check for comment. */
 883       if (quote == EOF)
 884         {
 885           if (cp[0] == '/' && cp[1] == '*')
 886             {
 887               in_comment = true;
 888               *cp++ = ' ';
 889               *cp++ = ' ';
 890               continue;
 891             }
 892           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 893             {
 894               in_comment = false;
 895               *cp++ = ' ';
 896               *cp++ = ' ';
 897               continue;
 898             }
 899         }
 900
 901       /* Check commenting. */
 902       if (in_comment)
 903         *cp = ' ';
 904       cp++;
 905     }
 906 }
 907
 908 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 909    tokenization by stripping comments and determining whether it
 910    is the beginning or end of a command and storing into
 911    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 912 void
 913 lex_preprocess_line (struct string *line,
 914                      enum syntax_mode syntax,
 915                      bool *line_starts_command,
 916                      bool *line_ends_command)
 917 {
 918   strip_comments (line);
 919   ds_rtrim (line, ss_cstr (CC_SPACES));
 920   *line_ends_command = (ds_chomp (line, settings_get_endcmd ())
 921                         || (ds_is_empty (line) && settings_get_nulline ()));
 922   *line_starts_command = false;
 923   if (syntax == GETL_BATCH)
 924     {
 925       int first = ds_first (line);
 926       *line_starts_command = !c_isspace (first);
 927       if (first == '+' || first == '-')
 928         *ds_data (line) = ' ';
 929     }
 930 }
 931
 932 /* Reads a line, without performing any preprocessing. */
 933 bool
 934 lex_get_line_raw (struct lexer *lexer)
 935 {
 936   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 937   if (ok)
 938     {
 939       const char *line = ds_cstr (&lexer->line_buffer);
 940       text_item_submit (text_item_create (TEXT_ITEM_SYNTAX, line));
 941     }
 942   else
 943     lexer->prog = NULL;
 944   return ok;
 945 }
 946
 947 /* Reads a line for use by the tokenizer, and preprocesses it by
 948    removing comments, stripping trailing whitespace and the
 949    terminal dot, and removing leading indentors. */
 950 bool
 951 lex_get_line (struct lexer *lexer)
 952 {
 953   bool line_starts_command;
 954
 955   if (!lex_get_line_raw (lexer))
 956     return false;
 957
 958   lex_preprocess_line (&lexer->line_buffer,
 959                        lex_current_syntax_mode (lexer),
 960                        &line_starts_command, &lexer->dot);
 961
 962   if (line_starts_command)
 963     lexer->put_token = T_ENDCMD;
 964
 965   lexer->prog = ds_cstr (&lexer->line_buffer);
 966   return true;
 967 }
 968 \f
 969 /* Token names. */
 970
 971 /* Returns the name of a token. */
 972 const char *
 973 lex_token_name (enum token_type token)
 974 {
 975   switch (token)
 976     {
 977     case T_ID:
 978     case T_POS_NUM:
 979     case T_NEG_NUM:
 980     case T_STRING:
 981       NOT_REACHED ();
 982
 983     case T_STOP:
 984       return "";
 985
 986     case T_ENDCMD:
 987       return ".";
 988
 989     case T_PLUS:
 990       return "+";
 991
 992     case T_DASH:
 993       return "-";
 994
 995     case T_ASTERISK:
 996       return "*";
 997
 998     case T_SLASH:
 999       return "/";
1000
1001     case T_EQUALS:
1002       return "=";
1003
1004     case T_LPAREN:
1005       return "(";
1006
1007     case T_RPAREN:
1008       return ")";
1009
1010     case T_LBRACK:
1011       return "[";
1012
1013     case T_RBRACK:
1014       return "]";
1015
1016     case T_COMMA:
1017       return ",";
1018
1019     case T_AND:
1020       return "AND";
1021
1022     case T_OR:
1023       return "OR";
1024
1025     case T_NOT:
1026       return "NOT";
1027
1028     case T_EQ:
1029       return "EQ";
1030
1031     case T_GE:
1032       return ">=";
1033
1034     case T_GT:
1035       return ">";
1036
1037     case T_LE:
1038       return "<=";
1039
1040     case T_LT:
1041       return "<";
1042
1043     case T_NE:
1044       return "~=";
1045
1046     case T_ALL:
1047       return "ALL";
1048
1049     case T_BY:
1050       return "BY";
1051
1052     case T_TO:
1053       return "TO";
1054
1055     case T_WITH:
1056       return "WITH";
1057
1058     case T_EXP:
1059       return "**";
1060     }
1061
1062   NOT_REACHED ();
1063 }
1064
1065 /* Returns an ASCII representation of the current token as a
1066    malloc()'d string. */
1067 char *
1068 lex_token_representation (struct lexer *lexer)
1069 {
1070   char *token_rep;
1071
1072   switch (lexer->token)
1073     {
1074     case T_ID:
1075     case T_POS_NUM:
1076     case T_NEG_NUM:
1077       return ss_xstrdup (lex_tokss (lexer));
1078
1079     case T_STRING:
1080       {
1081         struct substring ss;
1082         int hexstring = 0;
1083         char *sp, *dp;
1084
1085         ss = lex_tokss (lexer);
1086         for (sp = ss_data (ss); sp < ss_end (ss); sp++)
1087           if (!c_isprint ((unsigned char) *sp))
1088             {
1089               hexstring = 1;
1090               break;
1091             }
1092
1093         token_rep = xmalloc (2 + ss_length (ss) * 2 + 1 + 1);
1094
1095         dp = token_rep;
1096         if (hexstring)
1097           *dp++ = 'X';
1098         *dp++ = '\'';
1099
1100         for (sp = ss_data (ss); sp < ss_end (ss); sp++)
1101           if (!hexstring)
1102             {
1103               if (*sp == '\'')
1104                 *dp++ = '\'';
1105               *dp++ = (unsigned char) *sp;
1106             }
1107           else
1108             {
1109               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
1110               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
1111             }
1112         *dp++ = '\'';
1113         *dp = '\0';
1114
1115         return token_rep;
1116       }
1117
1118     default:
1119       return xstrdup (lex_token_name (lexer->token));
1120     }
1121 }
1122 \f
1123 /* Really weird functions. */
1124
1125 /* Skip a COMMENT command. */
1126 void
1127 lex_skip_comment (struct lexer *lexer)
1128 {
1129   for (;;)
1130     {
1131       if (!lex_get_line (lexer))
1132         {
1133           lexer->put_token = T_STOP;
1134           lexer->prog = NULL;
1135           return;
1136         }
1137
1138       if (lexer->put_token == T_ENDCMD)
1139         break;
1140
1141       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1142       lexer->prog = ds_end (&lexer->line_buffer);
1143       if (lexer->dot)
1144         break;
1145     }
1146 }
1147 \f
1148 /* Private functions. */
1149
1150 /* When invoked, tokstr contains a string of binary, octal, or
1151    hex digits, according to TYPE.  The string is converted to
1152    characters having the specified values. */
1153 static void
1154 convert_numeric_string_to_char_string (struct lexer *lexer,
1155                                        enum string_type type)
1156 {
1157   const char *base_name;
1158   int base;
1159   int chars_per_byte;
1160   size_t byte_cnt;
1161   size_t i;
1162   char *p;
1163
1164   switch (type)
1165     {
1166     case BINARY_STRING:
1167       base_name = _("binary");
1168       base = 2;
1169       chars_per_byte = 8;
1170       break;
1171     case OCTAL_STRING:
1172       base_name = _("octal");
1173       base = 8;
1174       chars_per_byte = 3;
1175       break;
1176     case HEX_STRING:
1177       base_name = _("hex");
1178       base = 16;
1179       chars_per_byte = 2;
1180       break;
1181     default:
1182       NOT_REACHED ();
1183     }
1184
1185   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1186   if (ds_length (&lexer->tokstr) % chars_per_byte)
1187     msg (SE, _("String of %s digits has %zu characters, which is not a "
1188                "multiple of %d."),
1189          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1190
1191   p = ds_cstr (&lexer->tokstr);
1192   for (i = 0; i < byte_cnt; i++)
1193     {
1194       int value;
1195       int j;
1196
1197       value = 0;
1198       for (j = 0; j < chars_per_byte; j++, p++)
1199         {
1200           int v;
1201
1202           if (*p >= '0' && *p <= '9')
1203             v = *p - '0';
1204           else
1205             {
1206               static const char alpha[] = "abcdef";
1207               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1208
1209               if (q)
1210                 v = q - alpha + 10;
1211               else
1212                 v = base;
1213             }
1214
1215           if (v >= base)
1216             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1217
1218           value = value * base + v;
1219         }
1220
1221       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1222     }
1223
1224   ds_truncate (&lexer->tokstr, byte_cnt);
1225 }
1226
1227 /* Parses a string from the input buffer into tokstr.  The input
1228    buffer pointer lexer->prog must point to the initial single or double
1229    quote.  TYPE indicates the type of string to be parsed.
1230    Returns token type. */
1231 static int
1232 parse_string (struct lexer *lexer, enum string_type type)
1233 {
1234   if (type != CHARACTER_STRING)
1235     lexer->prog++;
1236
1237   /* Accumulate the entire string, joining sections indicated by +
1238      signs. */
1239   for (;;)
1240     {
1241       /* Single or double quote. */
1242       int c = *lexer->prog++;
1243
1244       /* Accumulate section. */
1245       for (;;)
1246         {
1247           /* Check end of line. */
1248           if (*lexer->prog == '\0')
1249             {
1250               msg (SE, _("Unterminated string constant."));
1251               goto finish;
1252             }
1253
1254           /* Double quote characters to embed them in strings. */
1255           if (*lexer->prog == c)
1256             {
1257               if (lexer->prog[1] == c)
1258                 lexer->prog++;
1259               else
1260                 break;
1261             }
1262
1263           ds_put_byte (&lexer->tokstr, *lexer->prog++);
1264         }
1265       lexer->prog++;
1266
1267       /* Skip whitespace after final quote mark. */
1268       if (lexer->prog == NULL)
1269         break;
1270       for (;;)
1271         {
1272           while (c_isspace ((unsigned char) *lexer->prog))
1273             lexer->prog++;
1274           if (*lexer->prog)
1275             break;
1276
1277           if (lexer->dot)
1278             goto finish;
1279
1280           if (!lex_get_line (lexer))
1281             goto finish;
1282         }
1283
1284       /* Skip plus sign. */
1285       if (*lexer->prog != '+')
1286         break;
1287       lexer->prog++;
1288
1289       /* Skip whitespace after plus sign. */
1290       if (lexer->prog == NULL)
1291         break;
1292       for (;;)
1293         {
1294           while (c_isspace ((unsigned char) *lexer->prog))
1295             lexer->prog++;
1296           if (*lexer->prog)
1297             break;
1298
1299           if (lexer->dot)
1300             goto finish;
1301
1302           if (!lex_get_line (lexer))
1303             {
1304               msg (SE, _("Unexpected end of file in string concatenation."));
1305               goto finish;
1306             }
1307         }
1308
1309       /* Ensure that a valid string follows. */
1310       if (*lexer->prog != '\'' && *lexer->prog != '"')
1311         {
1312           msg (SE, _("String expected following `+'."));
1313           goto finish;
1314         }
1315     }
1316
1317   /* We come here when we've finished concatenating all the string sections
1318      into one large string. */
1319 finish:
1320   if (type != CHARACTER_STRING)
1321     convert_numeric_string_to_char_string (lexer, type);
1322
1323   return T_STRING;
1324 }
1325 \f
1326 /* Token Accessor Functions */
1327
1328 enum token_type
1329 lex_token (const struct lexer *lexer)
1330 {
1331   return lexer->token;
1332 }
1333
1334 double
1335 lex_tokval (const struct lexer *lexer)
1336 {
1337   return lexer->tokval;
1338 }
1339
1340 /* Returns the null-terminated string value associated with LEXER's current
1341    token.  For a T_ID token, this is the identifier, and for a T_STRING token,
1342    this is the string.  For other tokens the value is undefined. */
1343 const char *
1344 lex_tokcstr (const struct lexer *lexer)
1345 {
1346   return ds_cstr (&lexer->tokstr);
1347 }
1348
1349 /* Returns the string value associated with LEXER's current token.  For a T_ID
1350    token, this is the identifier, and for a T_STRING token, this is the string.
1351    For other tokens the value is undefined. */
1352 struct substring
1353 lex_tokss (const struct lexer *lexer)
1354 {
1355   return ds_ss (&lexer->tokstr);
1356 }
1357
1358 /* If the lexer is positioned at the (pseudo)identifier S, which
1359    may contain a hyphen ('-'), skips it and returns true.  Each
1360    half of the identifier may be abbreviated to its first three
1361    letters.
1362    Otherwise, returns false. */
1363 bool
1364 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1365 {
1366   const char *hyphen = strchr (s, '-');
1367   if (hyphen == NULL)
1368     return lex_match_id (lexer, s);
1369   else if (lexer->token != T_ID
1370            || !lex_id_match (ss_buffer (s, hyphen - s), lex_tokss (lexer))
1371            || lex_look_ahead (lexer) != T_DASH)
1372     return false;
1373   else
1374     {
1375       lex_get (lexer);
1376       lex_force_match (lexer, T_DASH);
1377       lex_force_match_id (lexer, hyphen + 1);
1378       return true;
1379     }
1380 }
1381