pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "lexer.h"
  20 #include <c-ctype.h>
  21 #include <c-strtod.h>
  22 #include <errno.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <language/command.h>
  29 #include <data/settings.h>
  30 #include <libpspp/assertion.h>
  31 #include <libpspp/getl.h>
  32 #include <libpspp/message.h>
  33 #include <libpspp/str.h>
  34 #include <output/journal.h>
  35 #include <output/text-item.h>
  36
  37 #include "xalloc.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43 struct lexer
  44 {
  45   struct string line_buffer;
  46
  47   struct source_stream *ss;
  48
  49   int token;      /* Current token. */
  50   double tokval;  /* T_POS_NUM, T_NEG_NUM: the token's value. */
  51
  52   char tokid [VAR_NAME_LEN + 1];   /* T_ID: the identifier. */
  53
  54   struct string tokstr;   /* T_ID, T_STRING: token string value.
  55                             For T_ID, this is not truncated as is
  56                             tokid. */
  57
  58   char *prog; /* Pointer to next token in line_buffer. */
  59   bool dot;   /* True only if this line ends with a terminal dot. */
  60
  61   int put_token ; /* If nonzero, next token returned by lex_get().
  62                     Used only in exceptional circumstances. */
  63
  64   struct string put_tokstr;
  65   double put_tokval;
  66 };
  67
  68
  69 static int parse_id (struct lexer *);
  70
  71 /* How a string represents its contents. */
  72 enum string_type
  73   {
  74     CHARACTER_STRING,   /* Characters. */
  75     BINARY_STRING,      /* Binary digits. */
  76     OCTAL_STRING,       /* Octal digits. */
  77     HEX_STRING          /* Hexadecimal digits. */
  78   };
  79
  80 static int parse_string (struct lexer *, enum string_type);
  81 \f
  82 /* Initialization. */
  83
  84 /* Initializes the lexer. */
  85 struct lexer *
  86 lex_create (struct source_stream *ss)
  87 {
  88   struct lexer *lexer = xzalloc (sizeof (*lexer));
  89
  90   ds_init_empty (&lexer->tokstr);
  91   ds_init_empty (&lexer->put_tokstr);
  92   ds_init_empty (&lexer->line_buffer);
  93   lexer->ss = ss;
  94
  95   return lexer;
  96 }
  97
  98 struct source_stream *
  99 lex_get_source_stream (const struct lexer *lex)
 100 {
 101   return lex->ss;
 102 }
 103
 104 enum syntax_mode
 105 lex_current_syntax_mode (const struct lexer *lex)
 106 {
 107   return source_stream_current_syntax_mode (lex->ss);
 108 }
 109
 110 enum error_mode
 111 lex_current_error_mode (const struct lexer *lex)
 112 {
 113   return source_stream_current_error_mode (lex->ss);
 114 }
 115
 116
 117 void
 118 lex_destroy (struct lexer *lexer)
 119 {
 120   if ( NULL != lexer )
 121     {
 122       ds_destroy (&lexer->put_tokstr);
 123       ds_destroy (&lexer->tokstr);
 124       ds_destroy (&lexer->line_buffer);
 125
 126       free (lexer);
 127     }
 128 }
 129
 130 \f
 131 /* Common functions. */
 132
 133 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
 134    tokval, respectively, and sets tokid appropriately. */
 135 static void
 136 restore_token (struct lexer *lexer)
 137 {
 138   assert (lexer->put_token != 0);
 139   lexer->token = lexer->put_token;
 140   ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
 141   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 142   lexer->tokval = lexer->put_tokval;
 143   lexer->put_token = 0;
 144 }
 145
 146 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
 147    put_lexer->tokval respectively. */
 148 static void
 149 save_token (struct lexer *lexer)
 150 {
 151   lexer->put_token = lexer->token;
 152   ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
 153   lexer->put_tokval = lexer->tokval;
 154 }
 155
 156 /* Parses a single token, setting appropriate global variables to
 157    indicate the token's attributes. */
 158 void
 159 lex_get (struct lexer *lexer)
 160 {
 161   /* Find a token. */
 162   for (;;)
 163     {
 164       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 165         {
 166           lexer->token = T_STOP;
 167           return;
 168         }
 169
 170       /* If a token was pushed ahead, return it. */
 171       if (lexer->put_token)
 172         {
 173           restore_token (lexer);
 174           return;
 175         }
 176
 177       for (;;)
 178         {
 179           /* Skip whitespace. */
 180           while (c_isspace ((unsigned char) *lexer->prog))
 181             lexer->prog++;
 182
 183           if (*lexer->prog)
 184             break;
 185
 186           if (lexer->dot)
 187             {
 188               lexer->dot = 0;
 189               lexer->token = T_ENDCMD;
 190               return;
 191             }
 192           else if (!lex_get_line (lexer))
 193             {
 194               lexer->prog = NULL;
 195               lexer->token = T_STOP;
 196               return;
 197             }
 198
 199           if (lexer->put_token)
 200             {
 201               restore_token (lexer);
 202               return;
 203             }
 204         }
 205
 206
 207       /* Actually parse the token. */
 208       ds_clear (&lexer->tokstr);
 209
 210       switch (*lexer->prog)
 211         {
 212         case '-': case '.':
 213         case '0': case '1': case '2': case '3': case '4':
 214         case '5': case '6': case '7': case '8': case '9':
 215           {
 216             char *tail;
 217
 218             /* `-' can introduce a negative number, or it can be a
 219                token by itself.  If it is not followed by a digit or a
 220                decimal point, it is definitely not a number.
 221                Otherwise, it might be either, but most of the time we
 222                want it as a number.  When the syntax calls for a `-'
 223                token, lex_negative_to_dash() must be used to break
 224                negative numbers into two tokens. */
 225             if (*lexer->prog == '-')
 226               {
 227                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 228                 while (c_isspace ((unsigned char) *lexer->prog))
 229                   lexer->prog++;
 230
 231                 if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
 232                   {
 233                     lexer->token = T_DASH;
 234                     break;
 235                   }
 236                 lexer->token = T_NEG_NUM;
 237               }
 238             else
 239               lexer->token = T_POS_NUM;
 240
 241             /* Parse the number, copying it into tokstr. */
 242             while (c_isdigit ((unsigned char) *lexer->prog))
 243               ds_put_byte (&lexer->tokstr, *lexer->prog++);
 244             if (*lexer->prog == '.')
 245               {
 246                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 247                 while (c_isdigit ((unsigned char) *lexer->prog))
 248                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 249               }
 250             if (*lexer->prog == 'e' || *lexer->prog == 'E')
 251               {
 252                 ds_put_byte (&lexer->tokstr, *lexer->prog++);
 253                 if (*lexer->prog == '+' || *lexer->prog == '-')
 254                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 255                 while (c_isdigit ((unsigned char) *lexer->prog))
 256                   ds_put_byte (&lexer->tokstr, *lexer->prog++);
 257               }
 258
 259             /* Parse as floating point. */
 260             lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail);
 261             if (*tail)
 262               {
 263                 msg (SE, _("%s does not form a valid number."),
 264                      ds_cstr (&lexer->tokstr));
 265                 lexer->tokval = 0.0;
 266
 267                 ds_clear (&lexer->tokstr);
 268                 ds_put_byte (&lexer->tokstr, '0');
 269               }
 270
 271             break;
 272           }
 273
 274         case '\'': case '"':
 275           lexer->token = parse_string (lexer, CHARACTER_STRING);
 276           break;
 277
 278         case '+':
 279           lexer->token = T_PLUS;
 280           lexer->prog++;
 281           break;
 282
 283         case '/':
 284           lexer->token = T_SLASH;
 285           lexer->prog++;
 286           break;
 287
 288         case '=':
 289           lexer->token = T_EQUALS;
 290           lexer->prog++;
 291           break;
 292
 293         case '(':
 294           lexer->token = T_LPAREN;
 295           lexer->prog++;
 296           break;
 297
 298         case ')':
 299           lexer->token = T_RPAREN;
 300           lexer->prog++;
 301           break;
 302
 303         case '[':
 304           lexer->token = T_LBRACK;
 305           lexer->prog++;
 306           break;
 307
 308         case ']':
 309           lexer->token = T_RBRACK;
 310           lexer->prog++;
 311           break;
 312
 313         case ',':
 314           lexer->token = T_COMMA;
 315           lexer->prog++;
 316           break;
 317
 318         case '*':
 319           if (*++lexer->prog == '*')
 320             {
 321               lexer->prog++;
 322               lexer->token = T_EXP;
 323             }
 324           else
 325             lexer->token = T_ASTERISK;
 326           break;
 327
 328         case '<':
 329           if (*++lexer->prog == '=')
 330             {
 331               lexer->prog++;
 332               lexer->token = T_LE;
 333             }
 334           else if (*lexer->prog == '>')
 335             {
 336               lexer->prog++;
 337               lexer->token = T_NE;
 338             }
 339           else
 340             lexer->token = T_LT;
 341           break;
 342
 343         case '>':
 344           if (*++lexer->prog == '=')
 345             {
 346               lexer->prog++;
 347               lexer->token = T_GE;
 348             }
 349           else
 350             lexer->token = T_GT;
 351           break;
 352
 353         case '~':
 354           if (*++lexer->prog == '=')
 355             {
 356               lexer->prog++;
 357               lexer->token = T_NE;
 358             }
 359           else
 360             lexer->token = T_NOT;
 361           break;
 362
 363         case '&':
 364           lexer->prog++;
 365           lexer->token = T_AND;
 366           break;
 367
 368         case '|':
 369           lexer->prog++;
 370           lexer->token = T_OR;
 371           break;
 372
 373         case 'b': case 'B':
 374           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 375             lexer->token = parse_string (lexer, BINARY_STRING);
 376           else
 377             lexer->token = parse_id (lexer);
 378           break;
 379
 380         case 'o': case 'O':
 381           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 382             lexer->token = parse_string (lexer, OCTAL_STRING);
 383           else
 384             lexer->token = parse_id (lexer);
 385           break;
 386
 387         case 'x': case 'X':
 388           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 389             lexer->token = parse_string (lexer, HEX_STRING);
 390           else
 391             lexer->token = parse_id (lexer);
 392           break;
 393
 394         default:
 395           if (lex_is_id1 (*lexer->prog))
 396             {
 397               lexer->token = parse_id (lexer);
 398               break;
 399             }
 400           else
 401             {
 402               unsigned char c = *lexer->prog++;
 403               char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c);
 404               msg (SE, _("Bad character in input: `%s'."), c_name);
 405               free (c_name);
 406               continue;
 407             }
 408         }
 409       break;
 410     }
 411 }
 412
 413 /* Parses an identifier at the current position into tokid and
 414    tokstr.
 415    Returns the correct token type. */
 416 static int
 417 parse_id (struct lexer *lexer)
 418 {
 419   struct substring rest_of_line
 420     = ss_substr (ds_ss (&lexer->line_buffer),
 421                  ds_pointer_to_position (&lexer->line_buffer, lexer->prog),
 422                  SIZE_MAX);
 423   struct substring id = ss_head (rest_of_line,
 424                                  lex_id_get_length (rest_of_line));
 425   lexer->prog += ss_length (id);
 426
 427   ds_assign_substring (&lexer->tokstr, id);
 428   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 429   return lex_id_to_token (id);
 430 }
 431
 432 /* Reports an error to the effect that subcommand SBC may only be
 433    specified once. */
 434 void
 435 lex_sbc_only_once (const char *sbc)
 436 {
 437   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 438 }
 439
 440 /* Reports an error to the effect that subcommand SBC is
 441    missing. */
 442 void
 443 lex_sbc_missing (struct lexer *lexer, const char *sbc)
 444 {
 445   lex_error (lexer, _("missing required subcommand %s"), sbc);
 446 }
 447
 448 /* Prints a syntax error message containing the current token and
 449    given message MESSAGE (if non-null). */
 450 void
 451 lex_error (struct lexer *lexer, const char *message, ...)
 452 {
 453   struct string s;
 454
 455   ds_init_empty (&s);
 456
 457   if (lexer->token == T_STOP)
 458     ds_put_cstr (&s, _("Syntax error at end of file"));
 459   else if (lexer->token == T_ENDCMD)
 460     ds_put_cstr (&s, _("Syntax error at end of command"));
 461   else
 462     {
 463       char *token_rep = lex_token_representation (lexer);
 464       ds_put_format (&s, _("Syntax error at `%s'"), token_rep);
 465       free (token_rep);
 466     }
 467
 468   if (message)
 469     {
 470       va_list args;
 471
 472       ds_put_cstr (&s, ": ");
 473
 474       va_start (args, message);
 475       ds_put_vformat (&s, message, args);
 476       va_end (args);
 477     }
 478
 479   msg (SE, "%s.", ds_cstr (&s));
 480   ds_destroy (&s);
 481 }
 482
 483 /* Checks that we're at end of command.
 484    If so, returns a successful command completion code.
 485    If not, flags a syntax error and returns an error command
 486    completion code. */
 487 int
 488 lex_end_of_command (struct lexer *lexer)
 489 {
 490   if (lexer->token != T_ENDCMD)
 491     {
 492       lex_error (lexer, _("expecting end of command"));
 493       return CMD_FAILURE;
 494     }
 495   else
 496     return CMD_SUCCESS;
 497 }
 498 \f
 499 /* Token testing functions. */
 500
 501 /* Returns true if the current token is a number. */
 502 bool
 503 lex_is_number (struct lexer *lexer)
 504 {
 505   return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
 506 }
 507
 508
 509 /* Returns true if the current token is a string. */
 510 bool
 511 lex_is_string (struct lexer *lexer)
 512 {
 513   return lexer->token == T_STRING;
 514 }
 515
 516
 517 /* Returns the value of the current token, which must be a
 518    floating point number. */
 519 double
 520 lex_number (struct lexer *lexer)
 521 {
 522   assert (lex_is_number (lexer));
 523   return lexer->tokval;
 524 }
 525
 526 /* Returns true iff the current token is an integer. */
 527 bool
 528 lex_is_integer (struct lexer *lexer)
 529 {
 530   return (lex_is_number (lexer)
 531           && lexer->tokval > LONG_MIN
 532           && lexer->tokval <= LONG_MAX
 533           && floor (lexer->tokval) == lexer->tokval);
 534 }
 535
 536 /* Returns the value of the current token, which must be an
 537    integer. */
 538 long
 539 lex_integer (struct lexer *lexer)
 540 {
 541   assert (lex_is_integer (lexer));
 542   return lexer->tokval;
 543 }
 544 \f
 545 /* Token matching functions. */
 546
 547 /* If TOK is the current token, skips it and returns true
 548    Otherwise, returns false. */
 549 bool
 550 lex_match (struct lexer *lexer, enum token_type t)
 551 {
 552   if (lexer->token == t)
 553     {
 554       lex_get (lexer);
 555       return true;
 556     }
 557   else
 558     return false;
 559 }
 560
 561 /* If the current token is the identifier S, skips it and returns
 562    true.  The identifier may be abbreviated to its first three
 563    letters.
 564    Otherwise, returns false. */
 565 bool
 566 lex_match_id (struct lexer *lexer, const char *s)
 567 {
 568   return lex_match_id_n (lexer, s, 3);
 569 }
 570
 571 /* If the current token is the identifier S, skips it and returns
 572    true.  The identifier may be abbreviated to its first N
 573    letters.
 574    Otherwise, returns false. */
 575 bool
 576 lex_match_id_n (struct lexer *lexer, const char *s, size_t n)
 577 {
 578   if (lexer->token == T_ID
 579       && lex_id_match_n (ss_cstr (s), ss_cstr (lexer->tokid), n))
 580     {
 581       lex_get (lexer);
 582       return true;
 583     }
 584   else
 585     return false;
 586 }
 587
 588 /* If the current token is integer N, skips it and returns true.
 589    Otherwise, returns false. */
 590 bool
 591 lex_match_int (struct lexer *lexer, int x)
 592 {
 593   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 594     {
 595       lex_get (lexer);
 596       return true;
 597     }
 598   else
 599     return false;
 600 }
 601 \f
 602 /* Forced matches. */
 603
 604 /* If this token is identifier S, fetches the next token and returns
 605    nonzero.
 606    Otherwise, reports an error and returns zero. */
 607 bool
 608 lex_force_match_id (struct lexer *lexer, const char *s)
 609 {
 610   if (lex_match_id (lexer, s))
 611     return true;
 612   else
 613     {
 614       lex_error (lexer, _("expecting `%s'"), s);
 615       return false;
 616     }
 617 }
 618
 619 /* If the current token is T, skips the token.  Otherwise, reports an
 620    error and returns from the current function with return value false. */
 621 bool
 622 lex_force_match (struct lexer *lexer, enum token_type t)
 623 {
 624   if (lexer->token == t)
 625     {
 626       lex_get (lexer);
 627       return true;
 628     }
 629   else
 630     {
 631       lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
 632       return false;
 633     }
 634 }
 635
 636 /* If this token is a string, does nothing and returns true.
 637    Otherwise, reports an error and returns false. */
 638 bool
 639 lex_force_string (struct lexer *lexer)
 640 {
 641   if (lex_is_string (lexer))
 642     return true;
 643   else
 644     {
 645       lex_error (lexer, _("expecting string"));
 646       return false;
 647     }
 648 }
 649
 650 /* If this token is an integer, does nothing and returns true.
 651    Otherwise, reports an error and returns false. */
 652 bool
 653 lex_force_int (struct lexer *lexer)
 654 {
 655   if (lex_is_integer (lexer))
 656     return true;
 657   else
 658     {
 659       lex_error (lexer, _("expecting integer"));
 660       return false;
 661     }
 662 }
 663
 664 /* If this token is a number, does nothing and returns true.
 665    Otherwise, reports an error and returns false. */
 666 bool
 667 lex_force_num (struct lexer *lexer)
 668 {
 669   if (lex_is_number (lexer))
 670     return true;
 671
 672   lex_error (lexer, _("expecting number"));
 673   return false;
 674 }
 675
 676 /* If this token is an identifier, does nothing and returns true.
 677    Otherwise, reports an error and returns false. */
 678 bool
 679 lex_force_id (struct lexer *lexer)
 680 {
 681   if (lexer->token == T_ID)
 682     return true;
 683
 684   lex_error (lexer, _("expecting identifier"));
 685   return false;
 686 }
 687
 688 /* Weird token functions. */
 689
 690 /* Returns the likely type of the next token, or 0 if it's hard to tell. */
 691 enum token_type
 692 lex_look_ahead (struct lexer *lexer)
 693 {
 694   if (lexer->put_token)
 695     return lexer->put_token;
 696
 697   for (;;)
 698     {
 699       if (NULL == lexer->prog && ! lex_get_line (lexer) )
 700         return 0;
 701
 702       for (;;)
 703         {
 704           while (c_isspace ((unsigned char) *lexer->prog))
 705             lexer->prog++;
 706           if (*lexer->prog)
 707             break;
 708
 709           if (lexer->dot)
 710             return T_ENDCMD;
 711           else if (!lex_get_line (lexer))
 712             return 0;
 713
 714           if (lexer->put_token)
 715             return lexer->put_token;
 716         }
 717
 718       switch (toupper ((unsigned char) *lexer->prog))
 719         {
 720         case 'X': case 'B': case 'O':
 721           if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
 722             return T_STRING;
 723           /* Fall through */
 724
 725         case '-':
 726           return T_DASH;
 727
 728         case '.':
 729         case '0': case '1': case '2': case '3': case '4':
 730         case '5': case '6': case '7': case '8': case '9':
 731           return T_POS_NUM;
 732
 733         case '\'': case '"':
 734           return T_STRING;
 735
 736         case '+':
 737           return T_PLUS;
 738
 739         case '/':
 740           return T_SLASH;
 741
 742         case '=':
 743           return T_EQUALS;
 744
 745         case '(':
 746           return T_LPAREN;
 747
 748         case ')':
 749           return T_RPAREN;
 750
 751         case '[':
 752           return T_LBRACK;
 753
 754         case ']':
 755           return T_RBRACK;
 756
 757         case ',':
 758           return T_COMMA;
 759
 760         case '*':
 761           return lexer->prog[1] == '*' ? T_EXP : T_ASTERISK;
 762
 763         case '<':
 764           return (lexer->prog[1] == '=' ? T_LE
 765                   : lexer->prog[1] == '>' ? T_NE
 766                   : T_LT);
 767
 768         case '>':
 769           return lexer->prog[1] == '=' ? T_GE : T_GT;
 770
 771         case '~':
 772           return lexer->prog[1] == '=' ? T_NE : T_NOT;
 773
 774         case '&':
 775           return T_AND;
 776
 777         case '|':
 778           return T_OR;
 779
 780         default:
 781           if (lex_is_id1 (*lexer->prog))
 782             return T_ID;
 783           return 0;
 784         }
 785     }
 786 }
 787
 788 /* Makes the current token become the next token to be read; the
 789    current token is set to T. */
 790 void
 791 lex_put_back (struct lexer *lexer, enum token_type t)
 792 {
 793   save_token (lexer);
 794   lexer->token = t;
 795 }
 796
 797 /* Makes the current token become the next token to be read; the
 798    current token is set to the identifier ID. */
 799 void
 800 lex_put_back_id (struct lexer *lexer, const char *id)
 801 {
 802   assert (lex_id_to_token (ss_cstr (id)) == T_ID);
 803   save_token (lexer);
 804   lexer->token = T_ID;
 805   ds_assign_cstr (&lexer->tokstr, id);
 806   str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
 807 }
 808 \f
 809 /* Weird line processing functions. */
 810
 811 /* Returns the entire contents of the current line. */
 812 const char *
 813 lex_entire_line (const struct lexer *lexer)
 814 {
 815   return ds_cstr (&lexer->line_buffer);
 816 }
 817
 818 const struct string *
 819 lex_entire_line_ds (const struct lexer *lexer)
 820 {
 821   return &lexer->line_buffer;
 822 }
 823
 824 /* As lex_entire_line(), but only returns the part of the current line
 825    that hasn't already been tokenized. */
 826 const char *
 827 lex_rest_of_line (const struct lexer *lexer)
 828 {
 829   return lexer->prog;
 830 }
 831
 832 /* Returns true if the current line ends in a terminal dot,
 833    false otherwise. */
 834 bool
 835 lex_end_dot (const struct lexer *lexer)
 836 {
 837   return lexer->dot;
 838 }
 839
 840 /* Causes the rest of the current input line to be ignored for
 841    tokenization purposes. */
 842 void
 843 lex_discard_line (struct lexer *lexer)
 844 {
 845   ds_cstr (&lexer->line_buffer);  /* Ensures ds_end points to something valid */
 846   lexer->prog = ds_end (&lexer->line_buffer);
 847   lexer->dot = false;
 848   lexer->put_token = 0;
 849 }
 850
 851
 852 /* Discards the rest of the current command.
 853    When we're reading commands from a file, we skip tokens until
 854    a terminal dot or EOF.
 855    When we're reading commands interactively from the user,
 856    that's just discarding the current line, because presumably
 857    the user doesn't want to finish typing a command that will be
 858    ignored anyway. */
 859 void
 860 lex_discard_rest_of_command (struct lexer *lexer)
 861 {
 862   if (!getl_is_interactive (lexer->ss))
 863     {
 864       while (lexer->token != T_STOP && lexer->token != T_ENDCMD)
 865         lex_get (lexer);
 866     }
 867   else
 868     lex_discard_line (lexer);
 869 }
 870 \f
 871 /* Weird line reading functions. */
 872
 873 /* Remove C-style comments in STRING, begun by slash-star and
 874    terminated by star-slash or newline. */
 875 static void
 876 strip_comments (struct string *string)
 877 {
 878   char *cp;
 879   int quote;
 880   bool in_comment;
 881
 882   in_comment = false;
 883   quote = EOF;
 884   for (cp = ds_cstr (string); *cp; )
 885     {
 886       /* If we're not in a comment, check for quote marks. */
 887       if (!in_comment)
 888         {
 889           if (*cp == quote)
 890             quote = EOF;
 891           else if (*cp == '\'' || *cp == '"')
 892             quote = *cp;
 893         }
 894
 895       /* If we're not inside a quotation, check for comment. */
 896       if (quote == EOF)
 897         {
 898           if (cp[0] == '/' && cp[1] == '*')
 899             {
 900               in_comment = true;
 901               *cp++ = ' ';
 902               *cp++ = ' ';
 903               continue;
 904             }
 905           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 906             {
 907               in_comment = false;
 908               *cp++ = ' ';
 909               *cp++ = ' ';
 910               continue;
 911             }
 912         }
 913
 914       /* Check commenting. */
 915       if (in_comment)
 916         *cp = ' ';
 917       cp++;
 918     }
 919 }
 920
 921 /* Prepares LINE, which is subject to the given SYNTAX rules, for
 922    tokenization by stripping comments and determining whether it
 923    is the beginning or end of a command and storing into
 924    *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */
 925 void
 926 lex_preprocess_line (struct string *line,
 927                      enum syntax_mode syntax,
 928                      bool *line_starts_command,
 929                      bool *line_ends_command)
 930 {
 931   strip_comments (line);
 932   ds_rtrim (line, ss_cstr (CC_SPACES));
 933   *line_ends_command = (ds_chomp (line, settings_get_endcmd ())
 934                         || (ds_is_empty (line) && settings_get_nulline ()));
 935   *line_starts_command = false;
 936   if (syntax == GETL_BATCH)
 937     {
 938       int first = ds_first (line);
 939       *line_starts_command = !c_isspace (first);
 940       if (first == '+' || first == '-')
 941         *ds_data (line) = ' ';
 942     }
 943 }
 944
 945 /* Reads a line, without performing any preprocessing. */
 946 bool
 947 lex_get_line_raw (struct lexer *lexer)
 948 {
 949   bool ok = getl_read_line (lexer->ss, &lexer->line_buffer);
 950   if (ok)
 951     {
 952       const char *line = ds_cstr (&lexer->line_buffer);
 953       text_item_submit (text_item_create (TEXT_ITEM_SYNTAX, line));
 954     }
 955   else
 956     lexer->prog = NULL;
 957   return ok;
 958 }
 959
 960 /* Reads a line for use by the tokenizer, and preprocesses it by
 961    removing comments, stripping trailing whitespace and the
 962    terminal dot, and removing leading indentors. */
 963 bool
 964 lex_get_line (struct lexer *lexer)
 965 {
 966   bool line_starts_command;
 967
 968   if (!lex_get_line_raw (lexer))
 969     return false;
 970
 971   lex_preprocess_line (&lexer->line_buffer,
 972                        lex_current_syntax_mode (lexer),
 973                        &line_starts_command, &lexer->dot);
 974
 975   if (line_starts_command)
 976     lexer->put_token = T_ENDCMD;
 977
 978   lexer->prog = ds_cstr (&lexer->line_buffer);
 979   return true;
 980 }
 981 \f
 982 /* Token names. */
 983
 984 /* Returns the name of a token. */
 985 const char *
 986 lex_token_name (enum token_type token)
 987 {
 988   switch (token)
 989     {
 990     case T_ID:
 991     case T_POS_NUM:
 992     case T_NEG_NUM:
 993     case T_STRING:
 994       NOT_REACHED ();
 995
 996     case T_STOP:
 997       return "";
 998
 999     case T_ENDCMD:
1000       return ".";
1001
1002     case T_PLUS:
1003       return "+";
1004
1005     case T_DASH:
1006       return "-";
1007
1008     case T_ASTERISK:
1009       return "*";
1010
1011     case T_SLASH:
1012       return "/";
1013
1014     case T_EQUALS:
1015       return "=";
1016
1017     case T_LPAREN:
1018       return "(";
1019
1020     case T_RPAREN:
1021       return ")";
1022
1023     case T_LBRACK:
1024       return "[";
1025
1026     case T_RBRACK:
1027       return "]";
1028
1029     case T_COMMA:
1030       return ",";
1031
1032     case T_AND:
1033       return "AND";
1034
1035     case T_OR:
1036       return "OR";
1037
1038     case T_NOT:
1039       return "NOT";
1040
1041     case T_EQ:
1042       return "EQ";
1043
1044     case T_GE:
1045       return ">=";
1046
1047     case T_GT:
1048       return ">";
1049
1050     case T_LE:
1051       return "<=";
1052
1053     case T_LT:
1054       return "<";
1055
1056     case T_NE:
1057       return "~=";
1058
1059     case T_ALL:
1060       return "ALL";
1061
1062     case T_BY:
1063       return "BY";
1064
1065     case T_TO:
1066       return "TO";
1067
1068     case T_WITH:
1069       return "WITH";
1070
1071     case T_EXP:
1072       return "**";
1073     }
1074
1075   NOT_REACHED ();
1076 }
1077
1078 /* Returns an ASCII representation of the current token as a
1079    malloc()'d string. */
1080 char *
1081 lex_token_representation (struct lexer *lexer)
1082 {
1083   char *token_rep;
1084
1085   switch (lexer->token)
1086     {
1087     case T_ID:
1088     case T_POS_NUM:
1089     case T_NEG_NUM:
1090       return ds_xstrdup (&lexer->tokstr);
1091
1092     case T_STRING:
1093       {
1094         int hexstring = 0;
1095         char *sp, *dp;
1096
1097         for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
1098           if (!c_isprint ((unsigned char) *sp))
1099             {
1100               hexstring = 1;
1101               break;
1102             }
1103
1104         token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
1105
1106         dp = token_rep;
1107         if (hexstring)
1108           *dp++ = 'X';
1109         *dp++ = '\'';
1110
1111         if (!hexstring)
1112           for (sp = ds_cstr (&lexer->tokstr); *sp; )
1113             {
1114               if (*sp == '\'')
1115                 *dp++ = '\'';
1116               *dp++ = (unsigned char) *sp++;
1117             }
1118         else
1119           for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
1120             {
1121               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
1122               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
1123             }
1124         *dp++ = '\'';
1125         *dp = '\0';
1126
1127         return token_rep;
1128       }
1129
1130     default:
1131       return xstrdup (lex_token_name (lexer->token));
1132     }
1133 }
1134 \f
1135 /* Really weird functions. */
1136
1137 /* Most of the time, a `-' is a lead-in to a negative number.  But
1138    sometimes it's actually part of the syntax.  If a dash can be part
1139    of syntax then this function is called to rip it off of a
1140    number. */
1141 void
1142 lex_negative_to_dash (struct lexer *lexer)
1143 {
1144   if (lexer->token == T_NEG_NUM)
1145     {
1146       lexer->token = T_POS_NUM;
1147       lexer->tokval = -lexer->tokval;
1148       ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
1149       save_token (lexer);
1150       lexer->token = T_DASH;
1151     }
1152 }
1153
1154 /* Skip a COMMENT command. */
1155 void
1156 lex_skip_comment (struct lexer *lexer)
1157 {
1158   for (;;)
1159     {
1160       if (!lex_get_line (lexer))
1161         {
1162           lexer->put_token = T_STOP;
1163           lexer->prog = NULL;
1164           return;
1165         }
1166
1167       if (lexer->put_token == T_ENDCMD)
1168         break;
1169
1170       ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
1171       lexer->prog = ds_end (&lexer->line_buffer);
1172       if (lexer->dot)
1173         break;
1174     }
1175 }
1176 \f
1177 /* Private functions. */
1178
1179 /* When invoked, tokstr contains a string of binary, octal, or
1180    hex digits, according to TYPE.  The string is converted to
1181    characters having the specified values. */
1182 static void
1183 convert_numeric_string_to_char_string (struct lexer *lexer,
1184                                        enum string_type type)
1185 {
1186   const char *base_name;
1187   int base;
1188   int chars_per_byte;
1189   size_t byte_cnt;
1190   size_t i;
1191   char *p;
1192
1193   switch (type)
1194     {
1195     case BINARY_STRING:
1196       base_name = _("binary");
1197       base = 2;
1198       chars_per_byte = 8;
1199       break;
1200     case OCTAL_STRING:
1201       base_name = _("octal");
1202       base = 8;
1203       chars_per_byte = 3;
1204       break;
1205     case HEX_STRING:
1206       base_name = _("hex");
1207       base = 16;
1208       chars_per_byte = 2;
1209       break;
1210     default:
1211       NOT_REACHED ();
1212     }
1213
1214   byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1215   if (ds_length (&lexer->tokstr) % chars_per_byte)
1216     msg (SE, _("String of %s digits has %zu characters, which is not a "
1217                "multiple of %d."),
1218          base_name, ds_length (&lexer->tokstr), chars_per_byte);
1219
1220   p = ds_cstr (&lexer->tokstr);
1221   for (i = 0; i < byte_cnt; i++)
1222     {
1223       int value;
1224       int j;
1225
1226       value = 0;
1227       for (j = 0; j < chars_per_byte; j++, p++)
1228         {
1229           int v;
1230
1231           if (*p >= '0' && *p <= '9')
1232             v = *p - '0';
1233           else
1234             {
1235               static const char alpha[] = "abcdef";
1236               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1237
1238               if (q)
1239                 v = q - alpha + 10;
1240               else
1241                 v = base;
1242             }
1243
1244           if (v >= base)
1245             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1246
1247           value = value * base + v;
1248         }
1249
1250       ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1251     }
1252
1253   ds_truncate (&lexer->tokstr, byte_cnt);
1254 }
1255
1256 /* Parses a string from the input buffer into tokstr.  The input
1257    buffer pointer lexer->prog must point to the initial single or double
1258    quote.  TYPE indicates the type of string to be parsed.
1259    Returns token type. */
1260 static int
1261 parse_string (struct lexer *lexer, enum string_type type)
1262 {
1263   if (type != CHARACTER_STRING)
1264     lexer->prog++;
1265
1266   /* Accumulate the entire string, joining sections indicated by +
1267      signs. */
1268   for (;;)
1269     {
1270       /* Single or double quote. */
1271       int c = *lexer->prog++;
1272
1273       /* Accumulate section. */
1274       for (;;)
1275         {
1276           /* Check end of line. */
1277           if (*lexer->prog == '\0')
1278             {
1279               msg (SE, _("Unterminated string constant."));
1280               goto finish;
1281             }
1282
1283           /* Double quote characters to embed them in strings. */
1284           if (*lexer->prog == c)
1285             {
1286               if (lexer->prog[1] == c)
1287                 lexer->prog++;
1288               else
1289                 break;
1290             }
1291
1292           ds_put_byte (&lexer->tokstr, *lexer->prog++);
1293         }
1294       lexer->prog++;
1295
1296       /* Skip whitespace after final quote mark. */
1297       if (lexer->prog == NULL)
1298         break;
1299       for (;;)
1300         {
1301           while (c_isspace ((unsigned char) *lexer->prog))
1302             lexer->prog++;
1303           if (*lexer->prog)
1304             break;
1305
1306           if (lexer->dot)
1307             goto finish;
1308
1309           if (!lex_get_line (lexer))
1310             goto finish;
1311         }
1312
1313       /* Skip plus sign. */
1314       if (*lexer->prog != '+')
1315         break;
1316       lexer->prog++;
1317
1318       /* Skip whitespace after plus sign. */
1319       if (lexer->prog == NULL)
1320         break;
1321       for (;;)
1322         {
1323           while (c_isspace ((unsigned char) *lexer->prog))
1324             lexer->prog++;
1325           if (*lexer->prog)
1326             break;
1327
1328           if (lexer->dot)
1329             goto finish;
1330
1331           if (!lex_get_line (lexer))
1332             {
1333               msg (SE, _("Unexpected end of file in string concatenation."));
1334               goto finish;
1335             }
1336         }
1337
1338       /* Ensure that a valid string follows. */
1339       if (*lexer->prog != '\'' && *lexer->prog != '"')
1340         {
1341           msg (SE, _("String expected following `+'."));
1342           goto finish;
1343         }
1344     }
1345
1346   /* We come here when we've finished concatenating all the string sections
1347      into one large string. */
1348 finish:
1349   if (type != CHARACTER_STRING)
1350     convert_numeric_string_to_char_string (lexer, type);
1351
1352   return T_STRING;
1353 }
1354 \f
1355 /* Token Accessor Functions */
1356
1357 enum token_type
1358 lex_token (const struct lexer *lexer)
1359 {
1360   return lexer->token;
1361 }
1362
1363 double
1364 lex_tokval (const struct lexer *lexer)
1365 {
1366   return lexer->tokval;
1367 }
1368
1369 const char *
1370 lex_tokid (const struct lexer *lexer)
1371 {
1372   return lexer->tokid;
1373 }
1374
1375 const struct string *
1376 lex_tokstr (const struct lexer *lexer)
1377 {
1378   return &lexer->tokstr;
1379 }
1380
1381 /* If the lexer is positioned at the (pseudo)identifier S, which
1382    may contain a hyphen ('-'), skips it and returns true.  Each
1383    half of the identifier may be abbreviated to its first three
1384    letters.
1385    Otherwise, returns false. */
1386 bool
1387 lex_match_hyphenated_word (struct lexer *lexer, const char *s)
1388 {
1389   const char *hyphen = strchr (s, '-');
1390   if (hyphen == NULL)
1391     return lex_match_id (lexer, s);
1392   else if (lexer->token != T_ID
1393            || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid))
1394            || lex_look_ahead (lexer) != T_DASH)
1395     return false;
1396   else
1397     {
1398       lex_get (lexer);
1399       lex_force_match (lexer, T_DASH);
1400       lex_force_match_id (lexer, hyphen + 1);
1401       return true;
1402     }
1403 }
1404