src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include "message.h"
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include "alloc.h"
  30 #include "command.h"
  31 #include "message.h"
  32 #include "line-buffer.h"
  33 #include "magic.h"
  34 #include "settings.h"
  35 #include "str.h"
  36
  37 #include "gettext.h"
  38 #define _(msgid) gettext (msgid)
  39 #define N_(msgid) msgid
  40
  41 /*
  42 #define DUMP_TOKENS 1
  43 */
  44
  45 \f
  46 /* Global variables. */
  47
  48 extern const char *keywords[T_N_KEYWORDS + 1];
  49
  50
  51 /* Current token. */
  52 int token;
  53
  54 /* T_POS_NUM, T_NEG_NUM: the token's value. */
  55 double tokval;
  56
  57 /* T_ID: the identifier. */
  58 char tokid[LONG_NAME_LEN + 1];
  59
  60 /* T_ID, T_STRING: token string value.
  61    For T_ID, this is not truncated as is tokid. */
  62 struct string tokstr;
  63 \f
  64 /* Static variables. */
  65
  66 /* Pointer to next token in getl_buf. */
  67 static char *prog;
  68
  69 /* Nonzero only if this line ends with a terminal dot. */
  70 static int dot;
  71
  72 /* Nonzero only if the last token returned was T_STOP. */
  73 static int eof;
  74
  75 /* If nonzero, next token returned by lex_get().
  76    Used only in exceptional circumstances. */
  77 static int put_token;
  78 static struct string put_tokstr;
  79 static double put_tokval;
  80
  81 static int parse_id (void);
  82
  83 /* How a string represents its contents. */
  84 enum string_type
  85   {
  86     CHARACTER_STRING,   /* Characters. */
  87     BINARY_STRING,      /* Binary digits. */
  88     OCTAL_STRING,       /* Octal digits. */
  89     HEX_STRING          /* Hexadecimal digits. */
  90   };
  91
  92 static void convert_numeric_string_to_char_string (enum string_type);
  93 static int parse_string (enum string_type);
  94
  95 #if DUMP_TOKENS
  96 static void dump_token (void);
  97 #endif
  98 \f
  99 /* Initialization. */
 100
 101 /* Initializes the lexer. */
 102 void
 103 lex_init (void)
 104 {
 105   ds_init (&tokstr, 64);
 106   ds_init (&put_tokstr, 64);
 107   if (!lex_get_line ())
 108     eof = true;
 109 }
 110
 111 void
 112 lex_done (void)
 113 {
 114   ds_destroy (&put_tokstr);
 115   ds_destroy (&tokstr);
 116 }
 117
 118 \f
 119 /* Common functions. */
 120
 121 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
 122    tokval, respectively, and sets tokid appropriately. */
 123 static void
 124 restore_token (void)
 125 {
 126   assert (put_token != 0);
 127   token = put_token;
 128   ds_replace (&tokstr, ds_c_str (&put_tokstr));
 129   str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
 130   tokval = put_tokval;
 131   put_token = 0;
 132 }
 133
 134 /* Copies token, tokstr, tokval into put_token, put_tokstr,
 135    put_tokval respectively. */
 136 static void
 137 save_token (void)
 138 {
 139   put_token = token;
 140   ds_replace (&put_tokstr, ds_c_str (&tokstr));
 141   put_tokval = tokval;
 142 }
 143
 144 /* Parses a single token, setting appropriate global variables to
 145    indicate the token's attributes. */
 146 void
 147 lex_get (void)
 148 {
 149   /* If a token was pushed ahead, return it. */
 150   if (put_token)
 151     {
 152       restore_token ();
 153 #if DUMP_TOKENS
 154       dump_token ();
 155 #endif
 156       return;
 157     }
 158
 159   /* Find a token. */
 160   for (;;)
 161     {
 162       /* Skip whitespace. */
 163       if (eof)
 164         {
 165           token = T_STOP;
 166           return;
 167         }
 168
 169       for (;;)
 170         {
 171           while (isspace ((unsigned char) *prog))
 172             prog++;
 173           if (*prog)
 174             break;
 175
 176           if (dot)
 177             {
 178               dot = 0;
 179               token = '.';
 180 #if DUMP_TOKENS
 181               dump_token ();
 182 #endif
 183               return;
 184             }
 185           else if (!lex_get_line ())
 186             {
 187               eof = 1;
 188               token = T_STOP;
 189 #if DUMP_TOKENS
 190               dump_token ();
 191 #endif
 192               return;
 193             }
 194
 195           if (put_token)
 196             {
 197               restore_token ();
 198 #if DUMP_TOKENS
 199               dump_token ();
 200 #endif
 201               return;
 202             }
 203         }
 204
 205
 206       /* Actually parse the token. */
 207       ds_clear (&tokstr);
 208
 209       switch (*prog)
 210         {
 211         case '-': case '.':
 212         case '0': case '1': case '2': case '3': case '4':
 213         case '5': case '6': case '7': case '8': case '9':
 214           {
 215             char *tail;
 216
 217             /* `-' can introduce a negative number, or it can be a
 218                token by itself.  If it is not followed by a digit or a
 219                decimal point, it is definitely not a number.
 220                Otherwise, it might be either, but most of the time we
 221                want it as a number.  When the syntax calls for a `-'
 222                token, lex_negative_to_dash() must be used to break
 223                negative numbers into two tokens. */
 224             if (*prog == '-')
 225               {
 226                 ds_putc (&tokstr, *prog++);
 227                 while (isspace ((unsigned char) *prog))
 228                   prog++;
 229
 230                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 231                   {
 232                     token = '-';
 233                     break;
 234                   }
 235                 token = T_NEG_NUM;
 236               }
 237             else
 238               token = T_POS_NUM;
 239
 240             /* Parse the number, copying it into tokstr. */
 241             while (isdigit ((unsigned char) *prog))
 242               ds_putc (&tokstr, *prog++);
 243             if (*prog == '.')
 244               {
 245                 ds_putc (&tokstr, *prog++);
 246                 while (isdigit ((unsigned char) *prog))
 247                   ds_putc (&tokstr, *prog++);
 248               }
 249             if (*prog == 'e' || *prog == 'E')
 250               {
 251                 ds_putc (&tokstr, *prog++);
 252                 if (*prog == '+' || *prog == '-')
 253                   ds_putc (&tokstr, *prog++);
 254                 while (isdigit ((unsigned char) *prog))
 255                   ds_putc (&tokstr, *prog++);
 256               }
 257
 258             /* Parse as floating point. */
 259             tokval = strtod (ds_c_str (&tokstr), &tail);
 260             if (*tail)
 261               {
 262                 msg (SE, _("%s does not form a valid number."),
 263                      ds_c_str (&tokstr));
 264                 tokval = 0.0;
 265
 266                 ds_clear (&tokstr);
 267                 ds_putc (&tokstr, '0');
 268               }
 269
 270             break;
 271           }
 272
 273         case '\'': case '"':
 274           token = parse_string (CHARACTER_STRING);
 275           break;
 276
 277         case '(': case ')': case ',': case '=': case '+': case '/':
 278           token = *prog++;
 279           break;
 280
 281         case '*':
 282           if (*++prog == '*')
 283             {
 284               prog++;
 285               token = T_EXP;
 286             }
 287           else
 288             token = '*';
 289           break;
 290
 291         case '<':
 292           if (*++prog == '=')
 293             {
 294               prog++;
 295               token = T_LE;
 296             }
 297           else if (*prog == '>')
 298             {
 299               prog++;
 300               token = T_NE;
 301             }
 302           else
 303             token = T_LT;
 304           break;
 305
 306         case '>':
 307           if (*++prog == '=')
 308             {
 309               prog++;
 310               token = T_GE;
 311             }
 312           else
 313             token = T_GT;
 314           break;
 315
 316         case '~':
 317           if (*++prog == '=')
 318             {
 319               prog++;
 320               token = T_NE;
 321             }
 322           else
 323             token = T_NOT;
 324           break;
 325
 326         case '&':
 327           prog++;
 328           token = T_AND;
 329           break;
 330
 331         case '|':
 332           prog++;
 333           token = T_OR;
 334           break;
 335
 336         case 'b': case 'B':
 337           if (prog[1] == '\'' || prog[1] == '"')
 338             token = parse_string (BINARY_STRING);
 339           else
 340             token = parse_id ();
 341           break;
 342
 343         case 'o': case 'O':
 344           if (prog[1] == '\'' || prog[1] == '"')
 345             token = parse_string (OCTAL_STRING);
 346           else
 347             token = parse_id ();
 348           break;
 349
 350         case 'x': case 'X':
 351           if (prog[1] == '\'' || prog[1] == '"')
 352             token = parse_string (HEX_STRING);
 353           else
 354             token = parse_id ();
 355           break;
 356
 357         default:
 358           if (lex_is_id1 (*prog))
 359             {
 360               token = parse_id ();
 361               break;
 362             }
 363           else
 364             {
 365               if (isgraph ((unsigned char) *prog))
 366                 msg (SE, _("Bad character in input: `%c'."), *prog++);
 367               else
 368                 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 369               continue;
 370             }
 371         }
 372       break;
 373     }
 374
 375 #if DUMP_TOKENS
 376   dump_token ();
 377 #endif
 378 }
 379
 380 /* Parses an identifier at the current position into tokid and
 381    tokstr.
 382    Returns the correct token type. */
 383 static int
 384 parse_id (void)
 385 {
 386   const char *start = prog;
 387   prog = lex_skip_identifier (start);
 388
 389   ds_concat (&tokstr, start, prog - start);
 390   str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
 391   return lex_id_to_token (ds_c_str (&tokstr), ds_length (&tokstr));
 392 }
 393
 394 /* Reports an error to the effect that subcommand SBC may only be
 395    specified once. */
 396 void
 397 lex_sbc_only_once (const char *sbc)
 398 {
 399   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 400 }
 401
 402 /* Reports an error to the effect that subcommand SBC is
 403    missing. */
 404 void
 405 lex_sbc_missing (const char *sbc)
 406 {
 407   lex_error (_("missing required subcommand %s"), sbc);
 408 }
 409
 410 /* Prints a syntax error message containing the current token and
 411    given message MESSAGE (if non-null). */
 412 void
 413 lex_error (const char *message, ...)
 414 {
 415   char *token_rep;
 416   char where[128];
 417
 418   token_rep = lex_token_representation ();
 419   if (token == T_STOP)
 420     strcpy (where, "end of file");
 421   else if (token == '.')
 422     strcpy (where, "end of command");
 423   else
 424     snprintf (where, sizeof where, "`%s'", token_rep);
 425   free (token_rep);
 426
 427   if (message)
 428     {
 429       char buf[1024];
 430       va_list args;
 431
 432       va_start (args, message);
 433       vsnprintf (buf, 1024, message, args);
 434       va_end (args);
 435
 436       msg (SE, _("Syntax error %s at %s."), buf, where);
 437     }
 438   else
 439     msg (SE, _("Syntax error at %s."), where);
 440 }
 441
 442 /* Checks that we're at end of command.
 443    If so, returns a successful command completion code.
 444    If not, flags a syntax error and returns an error command
 445    completion code. */
 446 int
 447 lex_end_of_command (void)
 448 {
 449   if (token != '.')
 450     {
 451       lex_error (_("expecting end of command"));
 452       return CMD_TRAILING_GARBAGE;
 453     }
 454   else
 455     return CMD_SUCCESS;
 456 }
 457 \f
 458 /* Token testing functions. */
 459
 460 /* Returns true if the current token is a number. */
 461 bool
 462 lex_is_number (void)
 463 {
 464   return token == T_POS_NUM || token == T_NEG_NUM;
 465 }
 466
 467 /* Returns the value of the current token, which must be a
 468    floating point number. */
 469 double
 470 lex_number (void)
 471 {
 472   assert (lex_is_number ());
 473   return tokval;
 474 }
 475
 476 /* Returns true iff the current token is an integer. */
 477 bool
 478 lex_is_integer (void)
 479 {
 480   return (lex_is_number ()
 481           && tokval != NOT_LONG
 482           && tokval >= LONG_MIN
 483           && tokval <= LONG_MAX
 484           && floor (tokval) == tokval);
 485 }
 486
 487 /* Returns the value of the current token, which must be an
 488    integer. */
 489 long
 490 lex_integer (void)
 491 {
 492   assert (lex_is_integer ());
 493   return tokval;
 494 }
 495 \f
 496 /* Token matching functions. */
 497
 498 /* If TOK is the current token, skips it and returns nonzero.
 499    Otherwise, returns zero. */
 500 int
 501 lex_match (int t)
 502 {
 503   if (token == t)
 504     {
 505       lex_get ();
 506       return 1;
 507     }
 508   else
 509     return 0;
 510 }
 511
 512 /* If the current token is the identifier S, skips it and returns
 513    nonzero.  The identifier may be abbreviated to its first three
 514    letters.
 515    Otherwise, returns zero. */
 516 int
 517 lex_match_id (const char *s)
 518 {
 519   if (token == T_ID && lex_id_match (s, tokid))
 520     {
 521       lex_get ();
 522       return 1;
 523     }
 524   else
 525     return 0;
 526 }
 527
 528 /* If the current token is integer N, skips it and returns nonzero.
 529    Otherwise, returns zero. */
 530 int
 531 lex_match_int (int x)
 532 {
 533   if (lex_is_integer () && lex_integer () == x)
 534     {
 535       lex_get ();
 536       return 1;
 537     }
 538   else
 539     return 0;
 540 }
 541 \f
 542 /* Forced matches. */
 543
 544 /* If this token is identifier S, fetches the next token and returns
 545    nonzero.
 546    Otherwise, reports an error and returns zero. */
 547 int
 548 lex_force_match_id (const char *s)
 549 {
 550   if (token == T_ID && lex_id_match (s, tokid))
 551     {
 552       lex_get ();
 553       return 1;
 554     }
 555   else
 556     {
 557       lex_error (_("expecting `%s'"), s);
 558       return 0;
 559     }
 560 }
 561
 562 /* If the current token is T, skips the token.  Otherwise, reports an
 563    error and returns from the current function with return value 0. */
 564 int
 565 lex_force_match (int t)
 566 {
 567   if (token == t)
 568     {
 569       lex_get ();
 570       return 1;
 571     }
 572   else
 573     {
 574       lex_error (_("expecting `%s'"), lex_token_name (t));
 575       return 0;
 576     }
 577 }
 578
 579 /* If this token is a string, does nothing and returns nonzero.
 580    Otherwise, reports an error and returns zero. */
 581 int
 582 lex_force_string (void)
 583 {
 584   if (token == T_STRING)
 585     return 1;
 586   else
 587     {
 588       lex_error (_("expecting string"));
 589       return 0;
 590     }
 591 }
 592
 593 /* If this token is an integer, does nothing and returns nonzero.
 594    Otherwise, reports an error and returns zero. */
 595 int
 596 lex_force_int (void)
 597 {
 598   if (lex_is_integer ())
 599     return 1;
 600   else
 601     {
 602       lex_error (_("expecting integer"));
 603       return 0;
 604     }
 605 }
 606
 607 /* If this token is a number, does nothing and returns nonzero.
 608    Otherwise, reports an error and returns zero. */
 609 int
 610 lex_force_num (void)
 611 {
 612   if (lex_is_number ())
 613     return 1;
 614   else
 615     {
 616       lex_error (_("expecting number"));
 617       return 0;
 618     }
 619 }
 620
 621 /* If this token is an identifier, does nothing and returns nonzero.
 622    Otherwise, reports an error and returns zero. */
 623 int
 624 lex_force_id (void)
 625 {
 626   if (token == T_ID)
 627     return 1;
 628   else
 629     {
 630       lex_error (_("expecting identifier"));
 631       return 0;
 632     }
 633 }
 634 /* Weird token functions. */
 635
 636 /* Returns the first character of the next token, except that if the
 637    next token is not an identifier, the character returned will not be
 638    a character that can begin an identifier.  Specifically, the
 639    hexstring lead-in X' causes lookahead() to return '.  Note that an
 640    alphanumeric return value doesn't guarantee an ID token, it could
 641    also be a reserved-word token. */
 642 int
 643 lex_look_ahead (void)
 644 {
 645   if (put_token)
 646     return put_token;
 647
 648   for (;;)
 649     {
 650       if (eof)
 651         return 0;
 652
 653       for (;;)
 654         {
 655           while (isspace ((unsigned char) *prog))
 656             prog++;
 657           if (*prog)
 658             break;
 659
 660           if (dot)
 661             return '.';
 662           else if (!lex_get_line ())
 663             return 0;
 664
 665           if (put_token)
 666             return put_token;
 667         }
 668
 669       if ((toupper ((unsigned char) *prog) == 'X'
 670            || toupper ((unsigned char) *prog) == 'B'
 671            || toupper ((unsigned char) *prog) == 'O')
 672           && (prog[1] == '\'' || prog[1] == '"'))
 673         return '\'';
 674
 675       return *prog;
 676     }
 677 }
 678
 679 /* Makes the current token become the next token to be read; the
 680    current token is set to T. */
 681 void
 682 lex_put_back (int t)
 683 {
 684   save_token ();
 685   token = t;
 686 }
 687
 688 /* Makes the current token become the next token to be read; the
 689    current token is set to the identifier ID. */
 690 void
 691 lex_put_back_id (const char *id)
 692 {
 693   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 694   save_token ();
 695   token = T_ID;
 696   ds_replace (&tokstr, id);
 697   str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
 698 }
 699 \f
 700 /* Weird line processing functions. */
 701
 702 /* Returns the entire contents of the current line. */
 703 const char *
 704 lex_entire_line (void)
 705 {
 706   return ds_c_str (&getl_buf);
 707 }
 708
 709 /* As lex_entire_line(), but only returns the part of the current line
 710    that hasn't already been tokenized.
 711    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 712    ends with a terminal dot, or zero if it doesn't. */
 713 const char *
 714 lex_rest_of_line (int *end_dot)
 715 {
 716   if (end_dot)
 717     *end_dot = dot;
 718   return prog;
 719 }
 720
 721 /* Causes the rest of the current input line to be ignored for
 722    tokenization purposes. */
 723 void
 724 lex_discard_line (void)
 725 {
 726   prog = ds_end (&getl_buf);
 727   dot = put_token = 0;
 728 }
 729
 730 /* Sets the current position in the current line to P, which must be
 731    in getl_buf. */
 732 void
 733 lex_set_prog (char *p)
 734 {
 735   prog = p;
 736 }
 737 \f
 738 /* Weird line reading functions. */
 739
 740 /* Remove C-style comments in STRING, begun by slash-star and
 741    terminated by star-slash or newline. */
 742 static void
 743 strip_comments (struct string *string)
 744 {
 745   char *cp;
 746   int quote;
 747   bool in_comment;
 748
 749   in_comment = false;
 750   quote = EOF;
 751   for (cp = ds_c_str (string); *cp; )
 752     {
 753       /* If we're not in a comment, check for quote marks. */
 754       if (!in_comment)
 755         {
 756           if (*cp == quote)
 757             quote = EOF;
 758           else if (*cp == '\'' || *cp == '"')
 759             quote = *cp;
 760         }
 761
 762       /* If we're not inside a quotation, check for comment. */
 763       if (quote == EOF)
 764         {
 765           if (cp[0] == '/' && cp[1] == '*')
 766             {
 767               in_comment = true;
 768               *cp++ = ' ';
 769               *cp++ = ' ';
 770               continue;
 771             }
 772           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 773             {
 774               in_comment = false;
 775               *cp++ = ' ';
 776               *cp++ = ' ';
 777               continue;
 778             }
 779         }
 780
 781       /* Check commenting. */
 782       if (in_comment)
 783         *cp = ' ';
 784       cp++;
 785     }
 786 }
 787
 788 /* Reads a line for use by the tokenizer, and preprocesses it by
 789    removing comments, stripping trailing whitespace and the
 790    terminal dot, and removing leading indentors. */
 791 bool
 792 lex_get_line (void)
 793 {
 794   struct string *line = &getl_buf;
 795   bool interactive;
 796
 797   if (!getl_read_line (&interactive))
 798     return false;
 799
 800   strip_comments (line);
 801   ds_rtrim_spaces (line);
 802
 803   /* Check for and remove terminal dot. */
 804   dot = (ds_chomp (line, get_endcmd ())
 805          || (ds_is_empty (line) && get_nulline ()));
 806
 807   /* Strip leading indentors or insert a terminal dot (unless the
 808      line was obtained interactively). */
 809   if (!interactive)
 810     {
 811       int first = ds_first (line);
 812
 813       if (first == '+' || first == '-')
 814         *ds_data (line) = ' ';
 815       else if (first != EOF && !isspace (first))
 816         put_token = '.';
 817     }
 818
 819   prog = ds_c_str (line);
 820
 821   return true;
 822 }
 823 \f
 824 /* Token names. */
 825
 826 /* Returns the name of a token in a static buffer. */
 827 const char *
 828 lex_token_name (int token)
 829 {
 830   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 831     return keywords[token - T_FIRST_KEYWORD];
 832
 833   if (token < 256)
 834     {
 835       static char t[2];
 836       t[0] = token;
 837       return t;
 838     }
 839
 840   return _("<ERROR>");
 841 }
 842
 843 /* Returns an ASCII representation of the current token as a
 844    malloc()'d string. */
 845 char *
 846 lex_token_representation (void)
 847 {
 848   char *token_rep;
 849
 850   switch (token)
 851     {
 852     case T_ID:
 853     case T_POS_NUM:
 854     case T_NEG_NUM:
 855       return xstrdup (ds_c_str (&tokstr));
 856       break;
 857
 858     case T_STRING:
 859       {
 860         int hexstring = 0;
 861         char *sp, *dp;
 862
 863         for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
 864           if (!isprint ((unsigned char) *sp))
 865             {
 866               hexstring = 1;
 867               break;
 868             }
 869
 870         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 871
 872         dp = token_rep;
 873         if (hexstring)
 874           *dp++ = 'X';
 875         *dp++ = '\'';
 876
 877         if (!hexstring)
 878           for (sp = ds_c_str (&tokstr); *sp; )
 879             {
 880               if (*sp == '\'')
 881                 *dp++ = '\'';
 882               *dp++ = (unsigned char) *sp++;
 883             }
 884         else
 885           for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
 886             {
 887               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 888               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 889             }
 890         *dp++ = '\'';
 891         *dp = '\0';
 892
 893         return token_rep;
 894       }
 895     break;
 896
 897     case T_STOP:
 898       token_rep = xmalloc (1);
 899       *token_rep = '\0';
 900       return token_rep;
 901
 902     case T_EXP:
 903       return xstrdup ("**");
 904
 905     default:
 906       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 907         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 908       else
 909         {
 910           token_rep = xmalloc (2);
 911           token_rep[0] = token;
 912           token_rep[1] = '\0';
 913           return token_rep;
 914         }
 915     }
 916
 917   assert (0);
 918 }
 919 \f
 920 /* Really weird functions. */
 921
 922 /* Most of the time, a `-' is a lead-in to a negative number.  But
 923    sometimes it's actually part of the syntax.  If a dash can be part
 924    of syntax then this function is called to rip it off of a
 925    number. */
 926 void
 927 lex_negative_to_dash (void)
 928 {
 929   if (token == T_NEG_NUM)
 930     {
 931       token = T_POS_NUM;
 932       tokval = -tokval;
 933       ds_replace (&tokstr, ds_c_str (&tokstr) + 1);
 934       save_token ();
 935       token = '-';
 936     }
 937 }
 938
 939 /* We're not at eof any more. */
 940 void
 941 lex_reset_eof (void)
 942 {
 943   eof = 0;
 944 }
 945
 946 /* Skip a COMMENT command. */
 947 void
 948 lex_skip_comment (void)
 949 {
 950   for (;;)
 951     {
 952       if (!lex_get_line ())
 953         {
 954           put_token = T_STOP;
 955           eof = 1;
 956           return;
 957         }
 958
 959       if (put_token == '.')
 960         break;
 961
 962       prog = ds_end (&getl_buf);
 963       if (dot)
 964         break;
 965     }
 966 }
 967 \f
 968 /* Private functions. */
 969
 970 /* When invoked, tokstr contains a string of binary, octal, or
 971    hex digits, according to TYPE.  The string is converted to
 972    characters having the specified values. */
 973 static void
 974 convert_numeric_string_to_char_string (enum string_type type)
 975 {
 976   const char *base_name;
 977   int base;
 978   int chars_per_byte;
 979   size_t byte_cnt;
 980   size_t i;
 981   char *p;
 982
 983   switch (type)
 984     {
 985     case BINARY_STRING:
 986       base_name = _("binary");
 987       base = 2;
 988       chars_per_byte = 8;
 989       break;
 990     case OCTAL_STRING:
 991       base_name = _("octal");
 992       base = 8;
 993       chars_per_byte = 3;
 994       break;
 995     case HEX_STRING:
 996       base_name = _("hex");
 997       base = 16;
 998       chars_per_byte = 2;
 999       break;
1000     default:
1001       abort ();
1002     }
1003
1004   byte_cnt = ds_length (&tokstr) / chars_per_byte;
1005   if (ds_length (&tokstr) % chars_per_byte)
1006     msg (SE, _("String of %s digits has %d characters, which is not a "
1007                "multiple of %d."),
1008          base_name, ds_length (&tokstr), chars_per_byte);
1009
1010   p = ds_c_str (&tokstr);
1011   for (i = 0; i < byte_cnt; i++)
1012     {
1013       int value;
1014       int j;
1015
1016       value = 0;
1017       for (j = 0; j < chars_per_byte; j++, p++)
1018         {
1019           int v;
1020
1021           if (*p >= '0' && *p <= '9')
1022             v = *p - '0';
1023           else
1024             {
1025               static const char alpha[] = "abcdef";
1026               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1027
1028               if (q)
1029                 v = q - alpha + 10;
1030               else
1031                 v = base;
1032             }
1033
1034           if (v >= base)
1035             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1036
1037           value = value * base + v;
1038         }
1039
1040       ds_c_str (&tokstr)[i] = (unsigned char) value;
1041     }
1042
1043   ds_truncate (&tokstr, byte_cnt);
1044 }
1045
1046 /* Parses a string from the input buffer into tokstr.  The input
1047    buffer pointer prog must point to the initial single or double
1048    quote.  TYPE indicates the type of string to be parsed.
1049    Returns token type. */
1050 static int
1051 parse_string (enum string_type type)
1052 {
1053   /* Accumulate the entire string, joining sections indicated by +
1054      signs. */
1055   for (;;)
1056     {
1057       /* Single or double quote. */
1058       int c = *prog++;
1059
1060       /* Accumulate section. */
1061       for (;;)
1062         {
1063           /* Check end of line. */
1064           if (*prog == '\0')
1065             {
1066               msg (SE, _("Unterminated string constant."));
1067               goto finish;
1068             }
1069
1070           /* Double quote characters to embed them in strings. */
1071           if (*prog == c)
1072             {
1073               if (prog[1] == c)
1074                 prog++;
1075               else
1076                 break;
1077             }
1078
1079           ds_putc (&tokstr, *prog++);
1080         }
1081       prog++;
1082
1083       /* Skip whitespace after final quote mark. */
1084       if (eof)
1085         break;
1086       for (;;)
1087         {
1088           while (isspace ((unsigned char) *prog))
1089             prog++;
1090           if (*prog)
1091             break;
1092
1093           if (dot)
1094             goto finish;
1095
1096           if (!lex_get_line ())
1097             goto finish;
1098         }
1099
1100       /* Skip plus sign. */
1101       if (*prog != '+')
1102         break;
1103       prog++;
1104
1105       /* Skip whitespace after plus sign. */
1106       if (eof)
1107         break;
1108       for (;;)
1109         {
1110           while (isspace ((unsigned char) *prog))
1111             prog++;
1112           if (*prog)
1113             break;
1114
1115           if (dot)
1116             goto finish;
1117
1118           if (!lex_get_line ())
1119             {
1120               msg (SE, _("Unexpected end of file in string concatenation."));
1121               goto finish;
1122             }
1123         }
1124
1125       /* Ensure that a valid string follows. */
1126       if (*prog != '\'' && *prog != '"')
1127         {
1128           msg (SE, _("String expected following `+'."));
1129           goto finish;
1130         }
1131     }
1132
1133   /* We come here when we've finished concatenating all the string sections
1134      into one large string. */
1135 finish:
1136   if (type != CHARACTER_STRING)
1137     convert_numeric_string_to_char_string (type);
1138
1139   if (ds_length (&tokstr) > 255)
1140     {
1141       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1142            ds_length (&tokstr));
1143       ds_truncate (&tokstr, 255);
1144     }
1145
1146   {
1147     /* FIXME. */
1148     size_t i;
1149     int warned = 0;
1150
1151     for (i = 0; i < ds_length (&tokstr); i++)
1152       if (ds_c_str (&tokstr)[i] == 0)
1153         {
1154           if (!warned)
1155             {
1156               msg (SE, _("Sorry, literal strings may not contain null "
1157                          "characters.  Replacing with spaces."));
1158               warned = 1;
1159             }
1160           ds_c_str (&tokstr)[i] = ' ';
1161         }
1162   }
1163
1164   return T_STRING;
1165 }
1166 \f
1167 #if DUMP_TOKENS
1168 /* Reads one token from the lexer and writes a textual representation
1169    on stdout for debugging purposes. */
1170 static void
1171 dump_token (void)
1172 {
1173   {
1174     const char *curfn;
1175     int curln;
1176
1177     getl_location (&curfn, &curln);
1178     if (curfn)
1179       fprintf (stderr, "%s:%d\t", curfn, curln);
1180   }
1181
1182   switch (token)
1183     {
1184     case T_ID:
1185       fprintf (stderr, "ID\t%s\n", tokid);
1186       break;
1187
1188     case T_POS_NUM:
1189     case T_NEG_NUM:
1190       fprintf (stderr, "NUM\t%f\n", tokval);
1191       break;
1192
1193     case T_STRING:
1194       fprintf (stderr, "STRING\t\"%s\"\n", ds_c_str (&tokstr));
1195       break;
1196
1197     case T_STOP:
1198       fprintf (stderr, "STOP\n");
1199       break;
1200
1201     case T_EXP:
1202       fprintf (stderr, "MISC\tEXP\"");
1203       break;
1204
1205     case 0:
1206       fprintf (stderr, "MISC\tEOF\n");
1207       break;
1208
1209     default:
1210       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1211         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1212       else
1213         fprintf (stderr, "PUNCT\t%c\n", token);
1214       break;
1215     }
1216 }
1217 #endif /* DUMP_TOKENS */