src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "lexer.h"
  20 #include <c-ctype.h>
  21 #include <c-strtod.h>
  22 #include <errno.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <language/command.h>
  29 #include <data/settings.h>
  30 #include <libpspp/assertion.h>
  31 #include <libpspp/getl.h>
  32 #include <libpspp/message.h>
  33 #include <libpspp/str.h>
  34 #include <output/journal.h>
  35 #include <output/text-item.h>
  36
  37 #include "xalloc.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43 struct lexer
  44 {
  45   struct string line_buffer;
  46
  47   struct source_stream *ss;
  48
  49   int token;      /* Current token. */
  50   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  51
  52   char tokid [VAR_NAME_LEN + 1];   /* T_ID: the identifier. */
  53
  54   struct string tokstr;   /* T_ID, T_STRING: token string value.
  55                             For T_ID, this is not truncated as is
  56                             tokid. */
  57
  58   char *prog; /* Pointer to next token in line_buffer. */
  59   bool dot;   /* True only if this line ends with a terminal dot. */
  60
  61   int put_token ; /* If nonzero, next token returned by lex_get().
  62                     Used only in exceptional circumstances. */
  63
  64   struct string put_tokstr;
  65   double put_tokval;
  66 };
  67
  68
  69 static int parse_id (struct lexer *);
  70
  71 /* How a string represents its contents. */
  72 enum string_type
  73   {
  74     CHARACTER_STRING,   /* Characters. */
  75     BINARY_STRING,      /* Binary digits. */
  76     OCTAL_STRING,       /* Octal digits. */
  77     HEX_STRING          /* Hexadecimal digits. */
  78   };
  79
  80 static int parse_string (struct lexer *, enum string_type);
  81 \f
  82 /* Initialization. */
  83
  84 /* Initializes the lexer. */
  85 struct lexer *
  86 lex_create (struct source_stream *ss)
  87 {
  88   struct lexer *lexer = xzalloc (sizeof (*lexer));
  89
  90   ds_init_empty (&lexer->tokstr);
  91   ds_init_empty (&lexer->put_tokstr);
  92   ds_init_empty (&lexer->line_buffer);
  93   lexer->ss = ss;
  94
  95   return lexer;
  96 }
  97
  98 struct source_stream *
  99 lex_get_source_stream (const struct lexer *lex)
 100 {
 101   return lex->ss;
 102 }
 103
 104 enum syntax_mode
 105 lex_current_syntax_mode (const struct lexer *lex)
 106 {
 107   return source_stream_current_syntax_mode (lex->ss);
 108 }
 109
 110 enum error_mode
 111 lex_current_error_mode (const struct lexer *lex)
 112 {
 113   return source_stream_current_error_mode (lex->ss);
 114 }
 115
 116
 117 void
 118 lex_destroy (struct lexer *lexer)
 119 {
 120   if ( NULL != lexer )
 121     {
 122       ds_destroy (&lexer->put_tokstr);
 123       ds_destroy (&lexer->tokstr);
 124       ds_destroy (&lexer->line_buffer);
 125
 126       free (lexer);
 127     }
 128 }
 129
 130 \f
 131 /* Common functions. */
 132
 133 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 134    tokval, respectively, and sets tokid appropriately. */
 135 static void
 136 restore_token (struct lexer *lexer)
 137 {
 138   assert (lexer->put_token != 0);
 139   lexer->token = lexer->put_token;
 140   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 141   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 142   lexer->tokval = lexer->put_tokval;
 143   lexer->put_token = 0;
 144 }
 145
 146 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 147    put_lexer->tokval respectively. */
 148 static void
 149 save_token (struct lexer *lexer)
 150 {
 151   lexer->put_token = lexer->token;
 152   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 153   lexer->put_tokval = lexer->tokval;
 154 }
 155
 156 /* Parses a single token, setting appropriate global variables to
 157    indicate the token's attributes. */
 158 void
 159 lex_get (struct lexer *lexer)
 160 {
 161   /* Find a token. */
 162   for (;;)
 163     {
 164       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 165         {
 166           lexer->token = T_STOP;
 167           return;
 168         }
 169
 170       /* If a token was pushed ahead, return it. */
 171       if (lexer->put_token)
 172         {
 173           restore_token (lexer);
 174           return;
 175         }
 176
 177       for (;;)
 178         {
 179           /* Skip whitespace. */
 180           while (c_isspace ((unsigned char) *lexer->prog))
 181             lexer->prog++;
 182
 183           if (*lexer->prog)
 184             break;
 185
 186           if (lexer->dot)
 187             {
 188               lexer->dot = 0;
 189               lexer->token = T_ENDCMD;
 190               return;
 191             }
 192           else if (!lex_get_line (lexer))
 193             {
 194               lexer->prog = NULL;
 195               lexer->token = T_STOP;
 196               return;
 197             }
 198
 199           if (lexer->put_token)
 200             {
 201               restore_token (lexer);
 202               return;
 203             }
 204         }
 205
 206
 207       /* Actually parse the token. */
 208       ds_clear (&lexer->tokstr);
 209
 210       switch (*lexer->prog)
 211         {
 212         case '-': case '.':
 213         case '0': case '1': case '2': case '3': case '4':
 214         case '5': case '6': case '7': case '8': case '9':
 215           {
 216             char *tail;
 217
 218             /* `-' can introduce a negative number, or it can be a token by
 219                itself. */
 220             if (*lexer->prog == '-')
 221               {
 222                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 223                 while (c_isspace ((unsigned char) *lexer->prog))
 224                   lexer->prog++;
 225
 226                 if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 227                   {
 228                     lexer->token = T_DASH;
 229                     break;
 230                   }
 231                 lexer->token = T_NEG_NUM;
 232               }
 233             else
 234               lexer->token = T_POS_NUM;
 235
 236             /* Parse the number, copying it into tokstr. */
 237             while (c_isdigit ((unsigned char) *lexer->prog))
 238               ds_put_byte (&lexer->tokstr, *lexer->prog++);
 239             if (*lexer->prog == '.')
 240               {
 241                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 242                 while (c_isdigit ((unsigned char) *lexer->prog))
 243                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 244               }
 245             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 246               {
 247                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 248                 if (*lexer->prog == '+' || *lexer->prog == '-')
 249                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 250                 while (c_isdigit ((unsigned char) *lexer->prog))
 251                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 252               }
 253
 254             /* Parse as floating point. */
 255             lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail);
 256             if (*tail)
 257               {
 258                 msg (SE, _("%s does not form a valid number."),
 259                      ds_cstr (&lexer->tokstr));
 260                 lexer->tokval = 0.0;
 261
 262                 ds_clear (&lexer->tokstr);
 263                 ds_put_byte (&lexer->tokstr, '0');
 264               }
 265
 266             break;
 267           }
 268
 269         case '\'': case '"':
 270           lexer->token = parse_string (lexer, CHARACTER_STRING);
 271           break;
 272
 273         case '+':
 274           lexer->token = T_PLUS;
 275           lexer->prog++;
 276           break;
 277
 278         case '/':
 279           lexer->token = T_SLASH;
 280           lexer->prog++;
 281           break;
 282
 283         case '=':
 284           lexer->token = T_EQUALS;
 285           lexer->prog++;
 286           break;
 287
 288         case '(':
 289           lexer->token = T_LPAREN;
 290           lexer->prog++;
 291           break;
 292
 293         case ')':
 294           lexer->token = T_RPAREN;
 295           lexer->prog++;
 296           break;
 297
 298         case '[':
 299           lexer->token = T_LBRACK;
 300           lexer->prog++;
 301           break;
 302
 303         case ']':
 304           lexer->token = T_RBRACK;
 305           lexer->prog++;
 306           break;
 307
 308         case ',':
 309           lexer->token = T_COMMA;
 310           lexer->prog++;
 311           break;
 312
 313         case '*':
 314           if (*++lexer->prog == '*')
 315             {
 316               lexer->prog++;
 317               lexer->token = T_EXP;
 318             }
 319           else
 320             lexer->token = T_ASTERISK;
 321           break;
 322
 323         case '<':
 324           if (*++lexer->prog == '=')
 325             {
 326               lexer->prog++;
 327               lexer->token = T_LE;
 328             }
 329           else if (*lexer->prog == '>')
 330             {
 331               lexer->prog++;
 332               lexer->token = T_NE;
 333             }
 334           else
 335             lexer->token = T_LT;
 336           break;
 337
 338         case '>':
 339           if (*++lexer->prog == '=')
 340             {
 341               lexer->prog++;
 342               lexer->token = T_GE;
 343             }
 344           else
 345             lexer->token = T_GT;
 346           break;
 347
 348         case '~':
 349           if (*++lexer->prog == '=')
 350             {
 351               lexer->prog++;
 352               lexer->token = T_NE;
 353             }
 354           else
 355             lexer->token = T_NOT;
 356           break;
 357
 358         case '&':
 359           lexer->prog++;
 360           lexer->token = T_AND;
 361           break;
 362
 363         case '|':
 364           lexer->prog++;
 365           lexer->token = T_OR;
 366           break;
 367
 368         case 'b': case 'B':
 369           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 370             lexer->token = parse_string (lexer, BINARY_STRING);
 371           else
 372             lexer->token = parse_id (lexer);
 373           break;
 374
 375         case 'o': case 'O':
 376           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 377             lexer->token = parse_string (lexer, OCTAL_STRING);
 378           else
 379             lexer->token = parse_id (lexer);
 380           break;
 381
 382         case 'x': case 'X':
 383           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 384             lexer->token = parse_string (lexer, HEX_STRING);
 385           else
 386             lexer->token = parse_id (lexer);
 387           break;
 388
 389         default:
 390           if (lex_is_id1 (*lexer->prog))
 391             {
 392               lexer->token = parse_id (lexer);
 393               break;
 394             }
 395           else
 396             {
 397               unsigned char c = *lexer->prog++;
 398               char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c);
 399               msg (SE, _("Bad character in input: `%s'."), c_name);
 400               free (c_name);
 401               continue;
 402             }
 403         }
 404       break;
 405     }
 406 }
 407
 408 /* Parses an identifier at the current position into tokid and
 409    tokstr.
 410    Returns the correct token type. */
 411 static int
 412 parse_id (struct lexer *lexer)
 413 {
 414   struct substring rest_of_line
 415     = ss_substr (ds_ss (&lexer->line_buffer),
 416                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 417                  SIZE_MAX);
 418   struct substring id = ss_head (rest_of_line,
 419                                  lex_id_get_length (rest_of_line));
 420   lexer->prog += ss_length (id);
 421
 422   ds_assign_substring (&lexer->tokstr, id);
 423   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 424   return lex_id_to_token (id);
 425 }
 426
 427 /* Reports an error to the effect that subcommand SBC may only be
 428    specified once. */
 429 void
 430 lex_sbc_only_once (const char *sbc)
 431 {
 432   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 433 }
 434
 435 /* Reports an error to the effect that subcommand SBC is
 436    missing. */
 437 void
 438 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 439 {
 440   lex_error (lexer, _("missing required subcommand %s"), sbc);
 441 }
 442
 443 /* Prints a syntax error message containing the current token and
 444    given message MESSAGE (if non-null). */
 445 void
 446 lex_error (struct lexer *lexer, const char *message, ...)
 447 {
 448   struct string s;
 449
 450   ds_init_empty (&s);
 451
 452   if (lexer->token == T_STOP)
 453     ds_put_cstr (&s, _("Syntax error at end of file"));
 454   else if (lexer->token == T_ENDCMD)
 455     ds_put_cstr (&s, _("Syntax error at end of command"));
 456   else
 457     {
 458       char *token_rep = lex_token_representation (lexer);
 459       ds_put_format (&s, _("Syntax error at `%s'"), token_rep);
 460       free (token_rep);
 461     }
 462
 463   if (message)
 464     {
 465       va_list args;
 466
 467       ds_put_cstr (&s, ": ");
 468
 469       va_start (args, message);
 470       ds_put_vformat (&s, message, args);
 471       va_end (args);
 472     }
 473
 474   msg (SE, "%s.", ds_cstr (&s));
 475   ds_destroy (&s);
 476 }
 477
 478 /* Checks that we're at end of command.
 479    If so, returns a successful command completion code.
 480    If not, flags a syntax error and returns an error command
 481    completion code. */
 482 int
 483 lex_end_of_command (struct lexer *lexer)
 484 {
 485   if (lexer->token != T_ENDCMD)
 486     {
 487       lex_error (lexer, _("expecting end of command"));
 488       return CMD_FAILURE;
 489     }
 490   else
 491     return CMD_SUCCESS;
 492 }
 493 \f
 494 /* Token testing functions. */
 495
 496 /* Returns true if the current token is a number. */
 497 bool
 498 lex_is_number (struct lexer *lexer)
 499 {
 500   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 501 }
 502
 503
 504 /* Returns true if the current token is a string. */
 505 bool
 506 lex_is_string (struct lexer *lexer)
 507 {
 508   return lexer->token == T_STRING;
 509 }
 510
 511
 512 /* Returns the value of the current token, which must be a
 513    floating point number. */
 514 double
 515 lex_number (struct lexer *lexer)
 516 {
 517   assert (lex_is_number (lexer));
 518   return lexer->tokval;
 519 }
 520
 521 /* Returns true iff the current token is an integer. */
 522 bool
 523 lex_is_integer (struct lexer *lexer)
 524 {
 525   return (lex_is_number (lexer)
 526           && lexer->tokval > LONG_MIN
 527           && lexer->tokval <= LONG_MAX
 528           && floor (lexer->tokval) == lexer->tokval);
 529 }
 530
 531 /* Returns the value of the current token, which must be an
 532    integer. */
 533 long
 534 lex_integer (struct lexer *lexer)
 535 {
 536   assert (lex_is_integer (lexer));
 537   return lexer->tokval;
 538 }
 539 \f
 540 /* Token matching functions. */
 541
 542 /* If TOK is the current token, skips it and returns true
 543    Otherwise, returns false. */
 544 bool
 545 lex_match (struct lexer *lexer, enum token_type t)
 546 {
 547   if (lexer->token == t)
 548     {
 549       lex_get (lexer);
 550       return true;
 551     }
 552   else
 553     return false;
 554 }
 555
 556 /* If the current token is the identifier S, skips it and returns
 557    true.  The identifier may be abbreviated to its first three
 558    letters.
 559    Otherwise, returns false. */
 560 bool
 561 lex_match_id (struct lexer *lexer, const char *s)
 562 {
 563   return lex_match_id_n (lexer, s, 3);
 564 }
 565
 566 /* If the current token is the identifier S, skips it and returns
 567    true.  The identifier may be abbreviated to its first N
 568    letters.
 569    Otherwise, returns false. */
 570 bool
 571 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 572 {
 573   if (lexer->token == T_ID
 574       && lex_id_match_n (ss_cstr (s), ss_cstr (lexer->tokid), n))
 575     {
 576       lex_get (lexer);
 577       return true;
 578     }
 579   else
 580     return false;
 581 }
 582
 583 /* If the current token is integer N, skips it and returns true.
 584    Otherwise, returns false. */
 585 bool
 586 lex_match_int (struct lexer *lexer, int x)
 587 {
 588   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 589     {
 590       lex_get (lexer);
 591       return true;
 592     }
 593   else
 594     return false;
 595 }
 596 \f
 597 /* Forced matches. */
 598
 599 /* If this token is identifier S, fetches the next token and returns
 600    nonzero.
 601    Otherwise, reports an error and returns zero. */
 602 bool
 603 lex_force_match_id (struct lexer *lexer, const char *s)
 604 {
 605   if (lex_match_id (lexer, s))
 606     return true;
 607   else
 608     {
 609       lex_error (lexer, _("expecting `%s'"), s);
 610       return false;
 611     }
 612 }
 613
 614 /* If the current token is T, skips the token.  Otherwise, reports an
 615    error and returns from the current function with return value false. */
 616 bool
 617 lex_force_match (struct lexer *lexer, enum token_type t)
 618 {
 619   if (lexer->token == t)
 620     {
 621       lex_get (lexer);
 622       return true;
 623     }
 624   else
 625     {
 626       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 627       return false;
 628     }
 629 }
 630
 631 /* If this token is a string, does nothing and returns true.
 632    Otherwise, reports an error and returns false. */
 633 bool
 634 lex_force_string (struct lexer *lexer)
 635 {
 636   if (lex_is_string (lexer))
 637     return true;
 638   else
 639     {
 640       lex_error (lexer, _("expecting string"));
 641       return false;
 642     }
 643 }
 644
 645 /* If this token is an integer, does nothing and returns true.
 646    Otherwise, reports an error and returns false. */
 647 bool
 648 lex_force_int (struct lexer *lexer)
 649 {
 650   if (lex_is_integer (lexer))
 651     return true;
 652   else
 653     {
 654       lex_error (lexer, _("expecting integer"));
 655       return false;
 656     }
 657 }
 658
 659 /* If this token is a number, does nothing and returns true.
 660    Otherwise, reports an error and returns false. */
 661 bool
 662 lex_force_num (struct lexer *lexer)
 663 {
 664   if (lex_is_number (lexer))
 665     return true;
 666
 667   lex_error (lexer, _("expecting number"));
 668   return false;
 669 }
 670
 671 /* If this token is an identifier, does nothing and returns true.
 672    Otherwise, reports an error and returns false. */
 673 bool
 674 lex_force_id (struct lexer *lexer)
 675 {
 676   if (lexer->token == T_ID)
 677     return true;
 678
 679   lex_error (lexer, _("expecting identifier"));
 680   return false;
 681 }
 682
 683 /* Weird token functions. */
 684
 685 /* Returns the likely type of the next token, or 0 if it's hard to tell. */
 686 enum token_type
 687 lex_look_ahead (struct lexer *lexer)
 688 {
 689   if (lexer->put_token)
 690     return lexer->put_token;
 691
 692   for (;;)
 693     {
 694       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 695         return 0;
 696
 697       for (;;)
 698         {
 699           while (c_isspace ((unsigned char) *lexer->prog))
 700             lexer->prog++;
 701           if (*lexer->prog)
 702             break;
 703
 704           if (lexer->dot)
 705             return T_ENDCMD;
 706           else if (!lex_get_line (lexer))
 707             return 0;
 708
 709           if (lexer->put_token)
 710             return lexer->put_token;
 711         }
 712
 713       switch (toupper ((unsigned char) *lexer->prog))
 714         {
 715         case 'X': case 'B': case 'O':
 716           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 717             return T_STRING;
 718           /* Fall through */
 719
 720         case '-':
 721           return T_DASH;
 722
 723         case '.':
 724         case '0': case '1': case '2': case '3': case '4':
 725         case '5': case '6': case '7': case '8': case '9':
 726           return T_POS_NUM;
 727
 728         case '\'': case '"':
 729           return T_STRING;
 730
 731         case '+':
 732           return T_PLUS;
 733
 734         case '/':
 735           return T_SLASH;
 736
 737         case '=':
 738           return T_EQUALS;
 739
 740         case '(':
 741           return T_LPAREN;
 742
 743         case ')':
 744           return T_RPAREN;
 745
 746         case '[':
 747           return T_LBRACK;
 748
 749         case ']':
 750           return T_RBRACK;
 751
 752         case ',':
 753           return T_COMMA;
 754
 755         case '*':
 756           return lexer->prog[1] == '*' ? T_EXP : T_ASTERISK;
 757
 758         case '<':
 759           return (lexer->prog[1] == '=' ? T_LE
 760                   : lexer->prog[1] == '>' ? T_NE
 761                   : T_LT);
 762
 763         case '>':
 764           return lexer->prog[1] == '=' ? T_GE : T_GT;
 765
 766         case '~':
 767           return lexer->prog[1] == '=' ? T_NE : T_NOT;
 768
 769         case '&':
 770           return T_AND;
 771
 772         case '|':
 773           return T_OR;
 774
 775         default:
 776           if (lex_is_id1 (*lexer->prog))
 777             return T_ID;
 778           return 0;
 779         }
 780     }
 781 }
 782
 783 /* Makes the current token become the next token to be read; the
 784    current token is set to T. */
 785 void
 786 lex_put_back (struct lexer *lexer, enum token_type t)
 787 {
 788   save_token (lexer);
 789   lexer->token = t;
 790 }
 791
 792 /* Makes the current token become the next token to be read; the
 793    current token is set to the identifier ID. */
 794 void
 795 lex_put_back_id (struct lexer *lexer, const char *id)
 796 {
 797   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 798   save_token (lexer);
 799   lexer->token = T_ID;
 800   ds_assign_cstr (&lexer->tokstr, id);
 801   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 802 }
 803 \f
 804 /* Weird line processing functions. */
 805
 806 /* Returns the entire contents of the current line. */
 807 const char *
 808 lex_entire_line (const struct lexer *lexer)
 809 {
 810   return ds_cstr (&lexer->line_buffer);
 811 }
 812
 813 const struct string *
 814 lex_entire_line_ds (const struct lexer *lexer)
 815 {
 816   return &lexer->line_buffer;
 817 }
 818
 819 /* As lex_entire_line(), but only returns the part of the current line
 820    that hasn't already been tokenized. */
 821 const char *
 822 lex_rest_of_line (const struct lexer *lexer)
 823 {
 824   return lexer->prog;
 825 }
 826
 827 /* Returns true if the current line ends in a terminal dot,
 828    false otherwise. */
 829 bool
 830 lex_end_dot (const struct lexer *lexer)
 831 {
 832   return lexer->dot;
 833 }
 834
 835 /* Causes the rest of the current input line to be ignored for
 836    tokenization purposes. */
 837 void
 838 lex_discard_line (struct lexer *lexer)
 839 {
 840   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 841   lexer->prog = ds_end (&lexer->line_buffer);
 842   lexer->dot = false;
 843   lexer->put_token = 0;
 844 }
 845
 846
 847 /* Discards the rest of the current command.
 848    When we're reading commands from a file, we skip tokens until
 849    a terminal dot or EOF.
 850    When we're reading commands interactively from the user,
 851    that's just discarding the current line, because presumably
 852    the user doesn't want to finish typing a command that will be
 853    ignored anyway. */
 854 void
 855 lex_discard_rest_of_command (struct lexer *lexer)
 856 {
 857   if (!getl_is_interactive (lexer->ss))
 858     {
 859       while (lexer->token != T_STOP && lexer->token != T_ENDCMD)
 860         lex_get (lexer);
 861     }
 862   else
 863     lex_discard_line (lexer);
 864 }
 865 \f
 866 /* Weird line reading functions. */
 867
 868 /* Remove C-style comments in STRING, begun by slash-star and
 869    terminated by star-slash or newline. */
 870 static void
 871 strip_comments (struct string *string)
 872 {
 873   char *cp;
 874   int quote;
 875   bool in_comment;
 876
 877   in_comment = false;
 878   quote = EOF;
 879   for (cp = ds_cstr (string); *cp; )
 880     {
 881       /* If we're not in a comment, check for quote marks. */
 882       if (!in_comment)
 883         {
 884           if (*cp == quote)
 885             quote = EOF;
 886           else if (*cp == '\'' || *cp == '"')
 887             quote = *cp;
 888         }
 889
 890       /* If we're not inside a quotation, check for comment. */
 891       if (quote == EOF)
 892         {
 893           if (cp[0] == '/' && cp[1] == '*')
 894             {
 895               in_comment = true;
 896               *cp++ = ' ';
 897               *cp++ = ' ';
 898               continue;
 899             }
 900           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 901             {
 902               in_comment = false;
 903               *cp++ = ' ';
 904               *cp++ = ' ';
 905               continue;
 906             }
 907         }
 908
 909       /* Check commenting. */
 910       if (in_comment)
 911         *cp = ' ';
 912       cp++;
 913     }
 914 }
 915
 916 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 917    tokenization by stripping comments and determining whether it
 918    is the beginning or end of a command and storing into
 919    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 920 void
 921 lex_preprocess_line (struct string *line,
 922                      enum syntax_mode syntax,
 923                      bool *line_starts_command,
 924                      bool *line_ends_command)
 925 {
 926   strip_comments (line);
 927   ds_rtrim (line, ss_cstr (CC_SPACES));
 928   *line_ends_command = (ds_chomp (line, settings_get_endcmd ())
 929                         || (ds_is_empty (line) && settings_get_nulline ()));
 930   *line_starts_command = false;
 931   if (syntax == GETL_BATCH)
 932     {
 933       int first = ds_first (line);
 934       *line_starts_command = !c_isspace (first);
 935       if (first == '+' || first == '-')
 936         *ds_data (line) = ' ';
 937     }
 938 }
 939
 940 /* Reads a line, without performing any preprocessing. */
 941 bool
 942 lex_get_line_raw (struct lexer *lexer)
 943 {
 944   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 945   if (ok)
 946     {
 947       const char *line = ds_cstr (&lexer->line_buffer);
 948       text_item_submit (text_item_create (TEXT_ITEM_SYNTAX, line));
 949     }
 950   else
 951     lexer->prog = NULL;
 952   return ok;
 953 }
 954
 955 /* Reads a line for use by the tokenizer, and preprocesses it by
 956    removing comments, stripping trailing whitespace and the
 957    terminal dot, and removing leading indentors. */
 958 bool
 959 lex_get_line (struct lexer *lexer)
 960 {
 961   bool line_starts_command;
 962
 963   if (!lex_get_line_raw (lexer))
 964     return false;
 965
 966   lex_preprocess_line (&lexer->line_buffer,
 967                        lex_current_syntax_mode (lexer),
 968                        &line_starts_command, &lexer->dot);
 969
 970   if (line_starts_command)
 971     lexer->put_token = T_ENDCMD;
 972
 973   lexer->prog = ds_cstr (&lexer->line_buffer);
 974   return true;
 975 }
 976 \f
 977 /* Token names. */
 978
 979 /* Returns the name of a token. */
 980 const char *
 981 lex_token_name (enum token_type token)
 982 {
 983   switch (token)
 984     {
 985     case T_ID:
 986     case T_POS_NUM:
 987     case T_NEG_NUM:
 988     case T_STRING:
 989       NOT_REACHED ();
 990
 991     case T_STOP:
 992       return "";
 993
 994     case T_ENDCMD:
 995       return ".";
 996
 997     case T_PLUS:
 998       return "+";
 999
1000     case T_DASH:
1001       return "-";
1002
1003     case T_ASTERISK:
1004       return "*";
1005
1006     case T_SLASH:
1007       return "/";
1008
1009     case T_EQUALS:
1010       return "=";
1011
1012     case T_LPAREN:
1013       return "(";
1014
1015     case T_RPAREN:
1016       return ")";
1017
1018     case T_LBRACK:
1019       return "[";
1020
1021     case T_RBRACK:
1022       return "]";
1023
1024     case T_COMMA:
1025       return ",";
1026
1027     case T_AND:
1028       return "AND";
1029
1030     case T_OR:
1031       return "OR";
1032
1033     case T_NOT:
1034       return "NOT";
1035
1036     case T_EQ:
1037       return "EQ";
1038
1039     case T_GE:
1040       return ">=";
1041
1042     case T_GT:
1043       return ">";
1044
1045     case T_LE:
1046       return "<=";
1047
1048     case T_LT:
1049       return "<";
1050
1051     case T_NE:
1052       return "~=";
1053
1054     case T_ALL:
1055       return "ALL";
1056
1057     case T_BY:
1058       return "BY";
1059
1060     case T_TO:
1061       return "TO";
1062
1063     case T_WITH:
1064       return "WITH";
1065
1066     case T_EXP:
1067       return "**";
1068     }
1069
1070   NOT_REACHED ();
1071 }
1072
1073 /* Returns an ASCII representation of the current token as a
1074    malloc()'d string. */
1075 char *
1076 lex_token_representation (struct lexer *lexer)
1077 {
1078   char *token_rep;
1079
1080   switch (lexer->token)
1081     {
1082     case T_ID:
1083     case T_POS_NUM:
1084     case T_NEG_NUM:
1085       return ds_xstrdup (&lexer->tokstr);
1086
1087     case T_STRING:
1088       {
1089         int hexstring = 0;
1090         char *sp, *dp;
1091
1092         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
1093           if (!c_isprint ((unsigned char) *sp))
1094             {
1095               hexstring = 1;
1096               break;
1097             }
1098
1099         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
1100
1101         dp = token_rep;
1102         if (hexstring)
1103           *dp++ = 'X';
1104         *dp++ = '\'';
1105
1106         if (!hexstring)
1107           for (sp = ds_cstr (&lexer->tokstr); *sp; )
1108             {
1109               if (*sp == '\'')
1110                 *dp++ = '\'';
1111               *dp++ = (unsigned char) *sp++;
1112             }
1113         else
1114           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
1115             {
1116               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
1117               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
1118             }
1119         *dp++ = '\'';
1120         *dp = '\0';
1121
1122         return token_rep;
1123       }
1124
1125     default:
1126       return xstrdup (lex_token_name (lexer->token));
1127     }
1128 }
1129 \f
1130 /* Really weird functions. */
1131
1132 /* Skip a COMMENT command. */
1133 void
1134 lex_skip_comment (struct lexer *lexer)
1135 {
1136   for (;;)
1137     {
1138       if (!lex_get_line (lexer))
1139         {
1140           lexer->put_token = T_STOP;
1141           lexer->prog = NULL;
1142           return;
1143         }
1144
1145       if (lexer->put_token == T_ENDCMD)
1146         break;
1147
1148       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1149       lexer->prog = ds_end (&lexer->line_buffer);
1150       if (lexer->dot)
1151         break;
1152     }
1153 }
1154 \f
1155 /* Private functions. */
1156
1157 /* When invoked, tokstr contains a string of binary, octal, or
1158    hex digits, according to TYPE.  The string is converted to
1159    characters having the specified values. */
1160 static void
1161 convert_numeric_string_to_char_string (struct lexer *lexer,
1162                                        enum string_type type)
1163 {
1164   const char *base_name;
1165   int base;
1166   int chars_per_byte;
1167   size_t byte_cnt;
1168   size_t i;
1169   char *p;
1170
1171   switch (type)
1172     {
1173     case BINARY_STRING:
1174       base_name = _("binary");
1175       base = 2;
1176       chars_per_byte = 8;
1177       break;
1178     case OCTAL_STRING:
1179       base_name = _("octal");
1180       base = 8;
1181       chars_per_byte = 3;
1182       break;
1183     case HEX_STRING:
1184       base_name = _("hex");
1185       base = 16;
1186       chars_per_byte = 2;
1187       break;
1188     default:
1189       NOT_REACHED ();
1190     }
1191
1192   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1193   if (ds_length (&lexer->tokstr) % chars_per_byte)
1194     msg (SE, _("String of %s digits has %zu characters, which is not a "
1195                "multiple of %d."),
1196          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1197
1198   p = ds_cstr (&lexer->tokstr);
1199   for (i = 0; i < byte_cnt; i++)
1200     {
1201       int value;
1202       int j;
1203
1204       value = 0;
1205       for (j = 0; j < chars_per_byte; j++, p++)
1206         {
1207           int v;
1208
1209           if (*p >= '0' && *p <= '9')
1210             v = *p - '0';
1211           else
1212             {
1213               static const char alpha[] = "abcdef";
1214               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1215
1216               if (q)
1217                 v = q - alpha + 10;
1218               else
1219                 v = base;
1220             }
1221
1222           if (v >= base)
1223             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1224
1225           value = value * base + v;
1226         }
1227
1228       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1229     }
1230
1231   ds_truncate (&lexer->tokstr, byte_cnt);
1232 }
1233
1234 /* Parses a string from the input buffer into tokstr.  The input
1235    buffer pointer lexer->prog must point to the initial single or double
1236    quote.  TYPE indicates the type of string to be parsed.
1237    Returns token type. */
1238 static int
1239 parse_string (struct lexer *lexer, enum string_type type)
1240 {
1241   if (type != CHARACTER_STRING)
1242     lexer->prog++;
1243
1244   /* Accumulate the entire string, joining sections indicated by +
1245      signs. */
1246   for (;;)
1247     {
1248       /* Single or double quote. */
1249       int c = *lexer->prog++;
1250
1251       /* Accumulate section. */
1252       for (;;)
1253         {
1254           /* Check end of line. */
1255           if (*lexer->prog == '\0')
1256             {
1257               msg (SE, _("Unterminated string constant."));
1258               goto finish;
1259             }
1260
1261           /* Double quote characters to embed them in strings. */
1262           if (*lexer->prog == c)
1263             {
1264               if (lexer->prog[1] == c)
1265                 lexer->prog++;
1266               else
1267                 break;
1268             }
1269
1270           ds_put_byte (&lexer->tokstr, *lexer->prog++);
1271         }
1272       lexer->prog++;
1273
1274       /* Skip whitespace after final quote mark. */
1275       if (lexer->prog == NULL)
1276         break;
1277       for (;;)
1278         {
1279           while (c_isspace ((unsigned char) *lexer->prog))
1280             lexer->prog++;
1281           if (*lexer->prog)
1282             break;
1283
1284           if (lexer->dot)
1285             goto finish;
1286
1287           if (!lex_get_line (lexer))
1288             goto finish;
1289         }
1290
1291       /* Skip plus sign. */
1292       if (*lexer->prog != '+')
1293         break;
1294       lexer->prog++;
1295
1296       /* Skip whitespace after plus sign. */
1297       if (lexer->prog == NULL)
1298         break;
1299       for (;;)
1300         {
1301           while (c_isspace ((unsigned char) *lexer->prog))
1302             lexer->prog++;
1303           if (*lexer->prog)
1304             break;
1305
1306           if (lexer->dot)
1307             goto finish;
1308
1309           if (!lex_get_line (lexer))
1310             {
1311               msg (SE, _("Unexpected end of file in string concatenation."));
1312               goto finish;
1313             }
1314         }
1315
1316       /* Ensure that a valid string follows. */
1317       if (*lexer->prog != '\'' && *lexer->prog != '"')
1318         {
1319           msg (SE, _("String expected following `+'."));
1320           goto finish;
1321         }
1322     }
1323
1324   /* We come here when we've finished concatenating all the string sections
1325      into one large string. */
1326 finish:
1327   if (type != CHARACTER_STRING)
1328     convert_numeric_string_to_char_string (lexer, type);
1329
1330   return T_STRING;
1331 }
1332 \f
1333 /* Token Accessor Functions */
1334
1335 enum token_type
1336 lex_token (const struct lexer *lexer)
1337 {
1338   return lexer->token;
1339 }
1340
1341 double
1342 lex_tokval (const struct lexer *lexer)
1343 {
1344   return lexer->tokval;
1345 }
1346
1347 const char *
1348 lex_tokid (const struct lexer *lexer)
1349 {
1350   return lexer->tokid;
1351 }
1352
1353 const struct string *
1354 lex_tokstr (const struct lexer *lexer)
1355 {
1356   return &lexer->tokstr;
1357 }
1358
1359 /* If the lexer is positioned at the (pseudo)identifier S, which
1360    may contain a hyphen ('-'), skips it and returns true.  Each
1361    half of the identifier may be abbreviated to its first three
1362    letters.
1363    Otherwise, returns false. */
1364 bool
1365 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1366 {
1367   const char *hyphen = strchr (s, '-');
1368   if (hyphen == NULL)
1369     return lex_match_id (lexer, s);
1370   else if (lexer->token != T_ID
1371            || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid))
1372            || lex_look_ahead (lexer) != T_DASH)
1373     return false;
1374   else
1375     {
1376       lex_get (lexer);
1377       lex_force_match (lexer, T_DASH);
1378       lex_force_match_id (lexer, hyphen + 1);
1379       return true;
1380     }
1381 }
1382