src/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18    02111-1307, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include "error.h"
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include "alloc.h"
  30 #include "command.h"
  31 #include "error.h"
  32 #include "getline.h"
  33 #include "magic.h"
  34 #include "settings.h"
  35 #include "str.h"
  36
  37 /*
  38 #define DUMP_TOKENS 1
  39 */
  40
  41 \f
  42 /* Global variables. */
  43
  44 /* Current token. */
  45 int token;
  46
  47 /* T_POS_NUM, T_NEG_NUM: the token's value. */
  48 double tokval;
  49
  50 /* T_ID: the identifier. */
  51 char tokid[9];
  52
  53 /* T_ID, T_STRING: token string value.
  54    For T_ID, this is not truncated to 8 characters as is tokid. */
  55 struct string tokstr;
  56 \f
  57 /* Static variables. */
  58
  59 /* Table of keywords. */
  60 static const char *keywords[T_N_KEYWORDS + 1] =
  61   {
  62     "AND", "OR", "NOT",
  63     "EQ", "GE", "GT", "LE", "LT", "NE",
  64     "ALL", "BY", "TO", "WITH",
  65     NULL,
  66   };
  67
  68 /* Pointer to next token in getl_buf. */
  69 static char *prog;
  70
  71 /* Nonzero only if this line ends with a terminal dot. */
  72 static int dot;
  73
  74 /* Nonzero only if the last token returned was T_STOP. */
  75 static int eof;
  76
  77 /* If nonzero, next token returned by lex_get().
  78    Used only in exceptional circumstances. */
  79 static int put_token;
  80 static struct string put_tokstr;
  81 static double put_tokval;
  82
  83 static void unexpected_eof (void);
  84 static inline int check_id (const char *id, size_t len);
  85 static void convert_numeric_string_to_char_string (int type);
  86 static int parse_string (int type);
  87
  88 #if DUMP_TOKENS
  89 static void dump_token (void);
  90 #endif
  91 \f
  92 /* Initialization. */
  93
  94 /* Initializes the lexer. */
  95 void
  96 lex_init (void)
  97 {
  98   ds_init (&put_tokstr, 64);
  99   if (!lex_get_line ())
 100     unexpected_eof ();
 101 }
 102
 103 void
 104 lex_done (void)
 105 {
 106   ds_destroy(&put_tokstr);
 107 }
 108
 109 \f
 110 /* Common functions. */
 111
 112 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
 113    tokval, respectively, and sets tokid appropriately. */
 114 static void
 115 restore_token (void)
 116 {
 117   assert (put_token != 0);
 118   token = put_token;
 119   ds_replace (&tokstr, ds_c_str (&put_tokstr));
 120   strncpy (tokid, ds_c_str (&put_tokstr), 8);
 121   tokid[8] = 0;
 122   tokval = put_tokval;
 123   put_token = 0;
 124 }
 125
 126 /* Copies token, tokstr, tokval into put_token, put_tokstr,
 127    put_tokval respectively. */
 128 static void
 129 save_token (void)
 130 {
 131   put_token = token;
 132   ds_replace (&put_tokstr, ds_c_str (&tokstr));
 133   put_tokval = tokval;
 134 }
 135
 136 /* Parses a single token, setting appropriate global variables to
 137    indicate the token's attributes. */
 138 void
 139 lex_get (void)
 140 {
 141   /* If a token was pushed ahead, return it. */
 142   if (put_token)
 143     {
 144       restore_token ();
 145 #if DUMP_TOKENS
 146       dump_token ();
 147 #endif
 148       return;
 149     }
 150
 151   /* Find a token. */
 152   for (;;)
 153     {
 154       char *cp;
 155
 156       /* Skip whitespace. */
 157       if (eof)
 158         unexpected_eof ();
 159
 160       for (;;)
 161         {
 162           while (isspace ((unsigned char) *prog))
 163             prog++;
 164           if (*prog)
 165             break;
 166
 167           if (dot)
 168             {
 169               dot = 0;
 170               token = '.';
 171 #if DUMP_TOKENS
 172               dump_token ();
 173 #endif
 174               return;
 175             }
 176           else if (!lex_get_line ())
 177             {
 178               eof = 1;
 179               token = T_STOP;
 180 #if DUMP_TOKENS
 181               dump_token ();
 182 #endif
 183               return;
 184             }
 185
 186           if (put_token)
 187             {
 188               restore_token ();
 189 #if DUMP_TOKENS
 190               dump_token ();
 191 #endif
 192               return;
 193             }
 194         }
 195
 196
 197       /* Actually parse the token. */
 198       cp = prog;
 199       ds_clear (&tokstr);
 200
 201       switch (*prog)
 202         {
 203         case '-': case '.':
 204         case '0': case '1': case '2': case '3': case '4':
 205         case '5': case '6': case '7': case '8': case '9':
 206           {
 207             char *tail;
 208
 209             /* `-' can introduce a negative number, or it can be a
 210                token by itself.  If it is not followed by a digit or a
 211                decimal point, it is definitely not a number.
 212                Otherwise, it might be either, but most of the time we
 213                want it as a number.  When the syntax calls for a `-'
 214                token, lex_negative_to_dash() must be used to break
 215                negative numbers into two tokens. */
 216             if (*cp == '-')
 217               {
 218                 ds_putc (&tokstr, *prog++);
 219                 while (isspace ((unsigned char) *prog))
 220                   prog++;
 221
 222                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 223                   {
 224                     token = '-';
 225                     break;
 226                   }
 227                 token = T_NEG_NUM;
 228               }
 229             else
 230               token = T_POS_NUM;
 231
 232             /* Parse the number, copying it into tokstr. */
 233             while (isdigit ((unsigned char) *prog))
 234               ds_putc (&tokstr, *prog++);
 235             if (*prog == '.')
 236               {
 237                 ds_putc (&tokstr, *prog++);
 238                 while (isdigit ((unsigned char) *prog))
 239                   ds_putc (&tokstr, *prog++);
 240               }
 241             if (*prog == 'e' || *prog == 'E')
 242               {
 243                 ds_putc (&tokstr, *prog++);
 244                 if (*prog == '+' || *prog == '-')
 245                   ds_putc (&tokstr, *prog++);
 246                 while (isdigit ((unsigned char) *prog))
 247                   ds_putc (&tokstr, *prog++);
 248               }
 249
 250             /* Parse as floating point. */
 251             tokval = strtod (ds_c_str (&tokstr), &tail);
 252             if (*tail)
 253               {
 254                 msg (SE, _("%s does not form a valid number."),
 255                      ds_c_str (&tokstr));
 256                 tokval = 0.0;
 257
 258                 ds_clear (&tokstr);
 259                 ds_putc (&tokstr, '0');
 260               }
 261
 262             break;
 263           }
 264
 265         case '\'': case '"':
 266           token = parse_string (0);
 267           break;
 268
 269         case '(': case ')': case ',': case '=': case '+': case '/':
 270           token = *prog++;
 271           break;
 272
 273         case '*':
 274           if (*++prog == '*')
 275             {
 276               prog++;
 277               token = T_EXP;
 278             }
 279           else
 280             token = '*';
 281           break;
 282
 283         case '<':
 284           if (*++prog == '=')
 285             {
 286               prog++;
 287               token = T_LE;
 288             }
 289           else if (*prog == '>')
 290             {
 291               prog++;
 292               token = T_NE;
 293             }
 294           else
 295             token = T_LT;
 296           break;
 297
 298         case '>':
 299           if (*++prog == '=')
 300             {
 301               prog++;
 302               token = T_GE;
 303             }
 304           else
 305             token = T_GT;
 306           break;
 307
 308         case '~':
 309           if (*++prog == '=')
 310             {
 311               prog++;
 312               token = T_NE;
 313             }
 314           else
 315             token = T_NOT;
 316           break;
 317
 318         case '&':
 319           prog++;
 320           token = T_AND;
 321           break;
 322
 323         case '|':
 324           prog++;
 325           token = T_OR;
 326           break;
 327
 328         case 'a': case 'b': case 'c': case 'd': case 'e':
 329         case 'f': case 'g': case 'h': case 'i': case 'j':
 330         case 'k': case 'l': case 'm': case 'n': case 'o':
 331         case 'p': case 'q': case 'r': case 's': case 't':
 332         case 'u': case 'v': case 'w': case 'x': case 'y':
 333         case 'z':
 334         case 'A': case 'B': case 'C': case 'D': case 'E':
 335         case 'F': case 'G': case 'H': case 'I': case 'J':
 336         case 'K': case 'L': case 'M': case 'N': case 'O':
 337         case 'P': case 'Q': case 'R': case 'S': case 'T':
 338         case 'U': case 'V': case 'W': case 'X': case 'Y':
 339         case 'Z':
 340         case '#': case '$': case '@':
 341           /* Strings can be specified in binary, octal, or hex using
 342                this special syntax. */
 343           if (prog[1] == '\'' || prog[1] == '"')
 344             {
 345               static const char special[3] = "box";
 346               const char *p;
 347
 348               p = strchr (special, tolower ((unsigned char) *prog));
 349               if (p)
 350                 {
 351                   prog++;
 352                   token = parse_string (p - special + 1);
 353                   break;
 354                 }
 355             }
 356
 357           /* Copy id to tokstr. */
 358           ds_putc (&tokstr, toupper ((unsigned char) *prog++));
 359           while (CHAR_IS_IDN (*prog))
 360             ds_putc (&tokstr, toupper ((unsigned char) *prog++));
 361
 362           /* Copy tokstr to tokid, truncating it to 8 characters. */
 363           strncpy (tokid, ds_c_str (&tokstr), 8);
 364           tokid[8] = 0;
 365
 366           token = check_id (ds_c_str (&tokstr), ds_length (&tokstr));
 367           break;
 368
 369         default:
 370           if (isgraph ((unsigned char) *prog))
 371             msg (SE, _("Bad character in input: `%c'."), *prog++);
 372           else
 373             msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 374           continue;
 375         }
 376
 377       break;
 378     }
 379
 380 #if DUMP_TOKENS
 381   dump_token ();
 382 #endif
 383 }
 384
 385 /* Prints a syntax error message containing the current token and
 386    given message MESSAGE (if non-null). */
 387 void
 388 lex_error (const char *message, ...)
 389 {
 390   char *token_rep;
 391
 392   token_rep = lex_token_representation ();
 393   if (token_rep[0] == 0)
 394     msg (SE, _("Syntax error at end of file."));
 395   else if (message)
 396     {
 397       char buf[1024];
 398       va_list args;
 399
 400       va_start (args, message);
 401       vsnprintf (buf, 1024, message, args);
 402       va_end (args);
 403
 404       msg (SE, _("Syntax error %s at `%s'."), buf, token_rep);
 405     }
 406   else
 407     msg (SE, _("Syntax error at `%s'."), token_rep);
 408
 409   free (token_rep);
 410 }
 411
 412 /* Checks that we're at end of command.
 413    If so, returns a successful command completion code.
 414    If not, flags a syntax error and returns an error command
 415    completion code. */
 416 int
 417 lex_end_of_command (void)
 418 {
 419   if (token != '.')
 420     {
 421       lex_error (_("expecting end of command"));
 422       return CMD_TRAILING_GARBAGE;
 423     }
 424   else
 425     return CMD_SUCCESS;
 426 }
 427 \f
 428 /* Token testing functions. */
 429
 430 /* Returns true if the current token is a number. */
 431 bool
 432 lex_is_number (void)
 433 {
 434   return token == T_POS_NUM || token == T_NEG_NUM;
 435 }
 436
 437 /* Returns the value of the current token, which must be a
 438    floating point number. */
 439 double
 440 lex_number (void)
 441 {
 442   assert (lex_is_number ());
 443   return tokval;
 444 }
 445
 446 /* Returns true iff the current token is an integer. */
 447 bool
 448 lex_is_integer (void)
 449 {
 450   return (lex_is_number ()
 451           && tokval != NOT_LONG
 452           && tokval >= LONG_MIN
 453           && tokval <= LONG_MAX
 454           && floor (tokval) == tokval);
 455 }
 456
 457 /* Returns the value of the current token, which must be an
 458    integer. */
 459 long
 460 lex_integer (void)
 461 {
 462   assert (lex_is_integer ());
 463   return tokval;
 464 }
 465 \f
 466 /* Token matching functions. */
 467
 468 /* If TOK is the current token, skips it and returns nonzero.
 469    Otherwise, returns zero. */
 470 int
 471 lex_match (int t)
 472 {
 473   if (token == t)
 474     {
 475       lex_get ();
 476       return 1;
 477     }
 478   else
 479     return 0;
 480 }
 481
 482 /* If the current token is the identifier S, skips it and returns
 483    nonzero.
 484    Otherwise, returns zero. */
 485 int
 486 lex_match_id (const char *s)
 487 {
 488   if (token == T_ID && lex_id_match (s, tokid))
 489     {
 490       lex_get ();
 491       return 1;
 492     }
 493   else
 494     return 0;
 495 }
 496
 497 /* If the current token is integer N, skips it and returns nonzero.
 498    Otherwise, returns zero. */
 499 int
 500 lex_match_int (int x)
 501 {
 502   if (lex_is_integer () && lex_integer () == x)
 503     {
 504       lex_get ();
 505       return 1;
 506     }
 507   else
 508     return 0;
 509 }
 510 \f
 511 /* Forced matches. */
 512
 513 /* If this token is identifier S, fetches the next token and returns
 514    nonzero.
 515    Otherwise, reports an error and returns zero. */
 516 int
 517 lex_force_match_id (const char *s)
 518 {
 519   if (token == T_ID && lex_id_match (s, tokid))
 520     {
 521       lex_get ();
 522       return 1;
 523     }
 524   else
 525     {
 526       lex_error (_("expecting `%s'"), s);
 527       return 0;
 528     }
 529 }
 530
 531 /* If the current token is T, skips the token.  Otherwise, reports an
 532    error and returns from the current function with return value 0. */
 533 int
 534 lex_force_match (int t)
 535 {
 536   if (token == t)
 537     {
 538       lex_get ();
 539       return 1;
 540     }
 541   else
 542     {
 543       lex_error (_("expecting %s"), lex_token_name (t));
 544       return 0;
 545     }
 546 }
 547
 548 /* If this token is a string, does nothing and returns nonzero.
 549    Otherwise, reports an error and returns zero. */
 550 int
 551 lex_force_string (void)
 552 {
 553   if (token == T_STRING)
 554     return 1;
 555   else
 556     {
 557       lex_error (_("expecting string"));
 558       return 0;
 559     }
 560 }
 561
 562 /* If this token is an integer, does nothing and returns nonzero.
 563    Otherwise, reports an error and returns zero. */
 564 int
 565 lex_force_int (void)
 566 {
 567   if (lex_is_integer ())
 568     return 1;
 569   else
 570     {
 571       lex_error (_("expecting integer"));
 572       return 0;
 573     }
 574 }
 575
 576 /* If this token is a number, does nothing and returns nonzero.
 577    Otherwise, reports an error and returns zero. */
 578 int
 579 lex_force_num (void)
 580 {
 581   if (lex_is_number ())
 582     return 1;
 583   else
 584     {
 585       lex_error (_("expecting number"));
 586       return 0;
 587     }
 588 }
 589
 590 /* If this token is an identifier, does nothing and returns nonzero.
 591    Otherwise, reports an error and returns zero. */
 592 int
 593 lex_force_id (void)
 594 {
 595   if (token == T_ID)
 596     return 1;
 597   else
 598     {
 599       lex_error (_("expecting identifier"));
 600       return 0;
 601     }
 602 }
 603 \f
 604 /* Comparing identifiers. */
 605
 606 /* Keywords match if one of the following is true: KW and TOK are
 607    identical (barring differences in case), or TOK is at least 3
 608    characters long and those characters are identical to KW.  KW_LEN
 609    is the length of KW, TOK_LEN is the length of TOK. */
 610 int
 611 lex_id_match_len (const char *kw, size_t kw_len,
 612                   const char *tok, size_t tok_len)
 613 {
 614   size_t i = 0;
 615
 616   assert (kw && tok);
 617   for (;;)
 618     {
 619       if (i == kw_len && i == tok_len)
 620         return 1;
 621       else if (i == tok_len)
 622         return i >= 3;
 623       else if (i == kw_len)
 624         return 0;
 625       else if (toupper ((unsigned char) kw[i])
 626                != toupper ((unsigned char) tok[i]))
 627         return 0;
 628
 629       i++;
 630     }
 631 }
 632
 633 /* Same as lex_id_match_len() minus the need to pass in the lengths. */
 634 int
 635 lex_id_match (const char *kw, const char *tok)
 636 {
 637   return lex_id_match_len (kw, strlen (kw), tok, strlen (tok));
 638 }
 639 \f
 640 /* Weird token functions. */
 641
 642 /* Returns the first character of the next token, except that if the
 643    next token is not an identifier, the character returned will not be
 644    a character that can begin an identifier.  Specifically, the
 645    hexstring lead-in X' causes lookahead() to return '.  Note that an
 646    alphanumeric return value doesn't guarantee an ID token, it could
 647    also be a reserved-word token. */
 648 int
 649 lex_look_ahead (void)
 650 {
 651   if (put_token)
 652     return put_token;
 653
 654   for (;;)
 655     {
 656       if (eof)
 657         unexpected_eof ();
 658
 659       for (;;)
 660         {
 661           while (isspace ((unsigned char) *prog))
 662             prog++;
 663           if (*prog)
 664             break;
 665
 666           if (dot)
 667             return '.';
 668           else if (!lex_get_line ())
 669             unexpected_eof ();
 670
 671           if (put_token)
 672             return put_token;
 673         }
 674
 675       if ((toupper ((unsigned char) *prog) == 'X'
 676            || toupper ((unsigned char) *prog) == 'B')
 677           && (prog[1] == '\'' || prog[1] == '"'))
 678         return '\'';
 679
 680       return *prog;
 681     }
 682 }
 683
 684 /* Makes the current token become the next token to be read; the
 685    current token is set to T. */
 686 void
 687 lex_put_back (int t)
 688 {
 689   save_token ();
 690   token = t;
 691 }
 692
 693 /* Makes the current token become the next token to be read; the
 694    current token is set to the identifier ID. */
 695 void
 696 lex_put_back_id (const char *id)
 697 {
 698   save_token ();
 699   token = T_ID;
 700   ds_replace (&tokstr, id);
 701   strncpy (tokid, ds_c_str (&tokstr), 8);
 702   tokid[8] = 0;
 703 }
 704 \f
 705 /* Weird line processing functions. */
 706
 707 /* Returns the entire contents of the current line. */
 708 const char *
 709 lex_entire_line (void)
 710 {
 711   return ds_c_str (&getl_buf);
 712 }
 713
 714 /* As lex_entire_line(), but only returns the part of the current line
 715    that hasn't already been tokenized.
 716    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 717    ends with a terminal dot, or zero if it doesn't. */
 718 const char *
 719 lex_rest_of_line (int *end_dot)
 720 {
 721   if (end_dot)
 722     *end_dot = dot;
 723   return prog;
 724 }
 725
 726 /* Causes the rest of the current input line to be ignored for
 727    tokenization purposes. */
 728 void
 729 lex_discard_line (void)
 730 {
 731   prog = ds_end (&getl_buf);
 732   dot = put_token = 0;
 733 }
 734
 735 /* Sets the current position in the current line to P, which must be
 736    in getl_buf. */
 737 void
 738 lex_set_prog (char *p)
 739 {
 740   prog = p;
 741 }
 742 \f
 743 /* Weird line reading functions. */
 744
 745 /* Read a line for use by the tokenizer. */
 746 int
 747 lex_get_line (void)
 748 {
 749   if (!getl_read_line ())
 750     return 0;
 751
 752   lex_preprocess_line ();
 753   return 1;
 754 }
 755
 756 /* Preprocesses getl_buf by removing comments, stripping trailing
 757    whitespace and the terminal dot, and removing leading indentors. */
 758 void
 759 lex_preprocess_line (void)
 760 {
 761   /* Strips comments. */
 762   {
 763     /* getl_buf iterator. */
 764     char *cp;
 765
 766     /* Nonzero inside a comment. */
 767     int comment;
 768
 769     /* Nonzero inside a quoted string. */
 770     int quote;
 771
 772     /* Remove C-style comments begun by slash-star and terminated by
 773      star-slash or newline. */
 774     quote = comment = 0;
 775     for (cp = ds_c_str (&getl_buf); *cp; )
 776       {
 777         /* If we're not commented out, toggle quoting. */
 778         if (!comment)
 779           {
 780             if (*cp == quote)
 781               quote = 0;
 782             else if (*cp == '\'' || *cp == '"')
 783               quote = *cp;
 784           }
 785
 786         /* If we're not quoting, toggle commenting. */
 787         if (!quote)
 788           {
 789             if (cp[0] == '/' && cp[1] == '*')
 790               {
 791                 comment = 1;
 792                 *cp++ = ' ';
 793                 *cp++ = ' ';
 794                 continue;
 795               }
 796             else if (cp[0] == '*' && cp[1] == '/' && comment)
 797               {
 798                 comment = 0;
 799                 *cp++ = ' ';
 800                 *cp++ = ' ';
 801                 continue;
 802               }
 803           }
 804
 805         /* Check commenting. */
 806         if (!comment)
 807           cp++;
 808         else
 809           *cp++ = ' ';
 810       }
 811   }
 812
 813   /* Strip trailing whitespace and terminal dot. */
 814   {
 815     size_t len = ds_length (&getl_buf);
 816     char *s = ds_c_str (&getl_buf);
 817
 818     /* Strip trailing whitespace. */
 819     while (len > 0 && isspace ((unsigned char) s[len - 1]))
 820       len--;
 821
 822     /* Check for and remove terminal dot. */
 823     if (len > 0 && s[len - 1] == get_endcmd() )
 824       {
 825         dot = 1;
 826         len--;
 827       }
 828     else if (len == 0 && get_nullline() )
 829       dot = 1;
 830     else
 831       dot = 0;
 832
 833     /* Set length. */
 834     ds_truncate (&getl_buf, len);
 835   }
 836
 837   /* In batch mode, strip leading indentors and insert a terminal dot
 838      as necessary. */
 839   if (getl_interactive != 2 && getl_mode == GETL_MODE_BATCH)
 840     {
 841       char *s = ds_c_str (&getl_buf);
 842
 843       if (s[0] == '+' || s[0] == '-' || s[0] == '.')
 844         s[0] = ' ';
 845       else if (s[0] && !isspace ((unsigned char) s[0]))
 846         put_token = '.';
 847     }
 848
 849   prog = ds_c_str (&getl_buf);
 850 }
 851 \f
 852 /* Token names. */
 853
 854 /* Returns the name of a token in a static buffer. */
 855 const char *
 856 lex_token_name (int token)
 857 {
 858   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 859     return keywords[token - T_FIRST_KEYWORD];
 860
 861   if (token < 256)
 862     {
 863       static char t[2];
 864       t[0] = token;
 865       return t;
 866     }
 867
 868   return _("<ERROR>");
 869 }
 870
 871 /* Returns an ASCII representation of the current token as a
 872    malloc()'d string. */
 873 char *
 874 lex_token_representation (void)
 875 {
 876   char *token_rep;
 877
 878   switch (token)
 879     {
 880     case T_ID:
 881     case T_POS_NUM:
 882     case T_NEG_NUM:
 883       return xstrdup (ds_c_str (&tokstr));
 884       break;
 885
 886     case T_STRING:
 887       {
 888         int hexstring = 0;
 889         char *sp, *dp;
 890
 891         for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
 892           if (!isprint ((unsigned char) *sp))
 893             {
 894               hexstring = 1;
 895               break;
 896             }
 897
 898         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 899
 900         dp = token_rep;
 901         if (hexstring)
 902           *dp++ = 'X';
 903         *dp++ = '\'';
 904
 905         if (!hexstring)
 906           for (sp = ds_c_str (&tokstr); *sp; )
 907             {
 908               if (*sp == '\'')
 909                 *dp++ = '\'';
 910               *dp++ = (unsigned char) *sp++;
 911             }
 912         else
 913           for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
 914             {
 915               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 916               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 917             }
 918         *dp++ = '\'';
 919         *dp = '\0';
 920
 921         return token_rep;
 922       }
 923     break;
 924
 925     case T_STOP:
 926       token_rep = xmalloc (1);
 927       *token_rep = '\0';
 928       return token_rep;
 929
 930     case T_EXP:
 931       return xstrdup ("**");
 932
 933     default:
 934       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 935         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 936       else
 937         {
 938           token_rep = xmalloc (2);
 939           token_rep[0] = token;
 940           token_rep[1] = '\0';
 941           return token_rep;
 942         }
 943     }
 944
 945   assert (0);
 946 }
 947 \f
 948 /* Really weird functions. */
 949
 950 /* Most of the time, a `-' is a lead-in to a negative number.  But
 951    sometimes it's actually part of the syntax.  If a dash can be part
 952    of syntax then this function is called to rip it off of a
 953    number. */
 954 void
 955 lex_negative_to_dash (void)
 956 {
 957   if (token == T_NEG_NUM)
 958     {
 959       token = T_POS_NUM;
 960       tokval = -tokval;
 961       ds_replace (&tokstr, ds_c_str (&tokstr) + 1);
 962       save_token ();
 963       token = '-';
 964     }
 965 }
 966
 967 /* We're not at eof any more. */
 968 void
 969 lex_reset_eof (void)
 970 {
 971   eof = 0;
 972 }
 973
 974 /* Skip a COMMENT command. */
 975 void
 976 lex_skip_comment (void)
 977 {
 978   for (;;)
 979     {
 980       if (!lex_get_line ())
 981         {
 982           put_token = T_STOP;
 983           eof = 1;
 984           return;
 985         }
 986
 987       if (put_token == '.')
 988         break;
 989
 990       prog = ds_end (&getl_buf);
 991       if (dot)
 992         break;
 993     }
 994 }
 995 \f
 996 /* Private functions. */
 997
 998 /* Unexpected end of file. */
 999 static void
1000 unexpected_eof (void)
1001 {
1002   msg (FE, _("Unexpected end of file."));
1003 }
1004
1005 /* Returns the proper token type, either T_ID or a reserved keyword
1006    enum, for ID[], which must contain LEN characters. */
1007 static inline int
1008 check_id (const char *id, size_t len)
1009 {
1010   const char **kwp;
1011
1012   if (len < 2 || len > 4)
1013     return T_ID;
1014
1015   for (kwp = keywords; *kwp; kwp++)
1016     if (!strcmp (*kwp, id))
1017       return T_FIRST_KEYWORD + (kwp - keywords);
1018
1019   return T_ID;
1020 }
1021
1022 /* When invoked, tokstr contains a string of binary, octal, or hex
1023    digits, for values of TYPE of 0, 1, or 2, respectively.  The string
1024    is converted to characters having the specified values. */
1025 static void
1026 convert_numeric_string_to_char_string (int type)
1027 {
1028   static const char *base_names[] = {N_("binary"), N_("octal"), N_("hex")};
1029   static const int bases[] = {2, 8, 16};
1030   static const int chars_per_byte[] = {8, 3, 2};
1031
1032   const char *const base_name = base_names[type];
1033   const int base = bases[type];
1034   const int cpb = chars_per_byte[type];
1035   const int nb = ds_length (&tokstr) / cpb;
1036   int i;
1037   char *p;
1038
1039   assert (type >= 0 && type <= 2);
1040
1041   if (ds_length (&tokstr) % cpb)
1042     msg (SE, _("String of %s digits has %d characters, which is not a "
1043                "multiple of %d."),
1044          gettext (base_name), ds_length (&tokstr), cpb);
1045
1046   p = ds_c_str (&tokstr);
1047   for (i = 0; i < nb; i++)
1048     {
1049       int value;
1050       int j;
1051
1052       value = 0;
1053       for (j = 0; j < cpb; j++, p++)
1054         {
1055           int v;
1056
1057           if (*p >= '0' && *p <= '9')
1058             v = *p - '0';
1059           else
1060             {
1061               static const char alpha[] = "abcdef";
1062               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1063
1064               if (q)
1065                 v = q - alpha + 10;
1066               else
1067                 v = base;
1068             }
1069
1070           if (v >= base)
1071             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1072
1073           value = value * base + v;
1074         }
1075
1076       ds_c_str (&tokstr)[i] = (unsigned char) value;
1077     }
1078
1079   ds_truncate (&tokstr, nb);
1080 }
1081
1082 /* Parses a string from the input buffer into tokstr.  The input
1083    buffer pointer prog must point to the initial single or double
1084    quote.  TYPE is 0 if it is an ordinary string, or 1, 2, or 3 for a
1085    binary, octal, or hexstring, respectively.  Returns token type. */
1086 static int
1087 parse_string (int type)
1088 {
1089   /* Accumulate the entire string, joining sections indicated by +
1090      signs. */
1091   for (;;)
1092     {
1093       /* Single or double quote. */
1094       int c = *prog++;
1095
1096       /* Accumulate section. */
1097       for (;;)
1098         {
1099           /* Check end of line. */
1100           if (*prog == 0)
1101             {
1102               msg (SE, _("Unterminated string constant."));
1103               goto finish;
1104             }
1105
1106           /* Double quote characters to embed them in strings. */
1107           if (*prog == c)
1108             {
1109               if (prog[1] == c)
1110                 prog++;
1111               else
1112                 break;
1113             }
1114
1115           ds_putc (&tokstr, *prog++);
1116         }
1117       prog++;
1118
1119       /* Skip whitespace after final quote mark. */
1120       if (eof)
1121         break;
1122       for (;;)
1123         {
1124           while (isspace ((unsigned char) *prog))
1125             prog++;
1126           if (*prog)
1127             break;
1128
1129           if (dot)
1130             goto finish;
1131
1132           if (!lex_get_line ())
1133             unexpected_eof ();
1134         }
1135
1136       /* Skip plus sign. */
1137       if (*prog != '+')
1138         break;
1139       prog++;
1140
1141       /* Skip whitespace after plus sign. */
1142       if (eof)
1143         break;
1144       for (;;)
1145         {
1146           while (isspace ((unsigned char) *prog))
1147             prog++;
1148           if (*prog)
1149             break;
1150
1151           if (dot)
1152             goto finish;
1153
1154           if (!lex_get_line ())
1155             unexpected_eof ();
1156         }
1157
1158       /* Ensure that a valid string follows. */
1159       if (*prog != '\'' && *prog != '"')
1160         {
1161           msg (SE, "String expected following `+'.");
1162           goto finish;
1163         }
1164     }
1165
1166   /* We come here when we've finished concatenating all the string sections
1167      into one large string. */
1168 finish:
1169   if (type != 0)
1170     convert_numeric_string_to_char_string (type - 1);
1171
1172   if (ds_length (&tokstr) > 255)
1173     {
1174       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1175            ds_length (&tokstr));
1176       ds_truncate (&tokstr, 255);
1177     }
1178
1179   {
1180     /* FIXME. */
1181     size_t i;
1182     int warned = 0;
1183
1184     for (i = 0; i < ds_length (&tokstr); i++)
1185       if (ds_c_str (&tokstr)[i] == 0)
1186         {
1187           if (!warned)
1188             {
1189               msg (SE, _("Sorry, literal strings may not contain null "
1190                          "characters.  Replacing with spaces."));
1191               warned = 1;
1192             }
1193           ds_c_str (&tokstr)[i] = ' ';
1194         }
1195   }
1196
1197   return T_STRING;
1198 }
1199 \f
#if DUMP_TOKENS
/* Writes a textual description of the current token to stderr for
   debugging purposes, preceded by the current input location when
   getl_location() reports a file name.  Each description is a single
   line. */
static void
dump_token (void)
{
  {
    const char *curfn;
    int curln;

    getl_location (&curfn, &curln);
    if (curfn)
      fprintf (stderr, "%s:%d\t", curfn, curln);
  }

  switch (token)
    {
    case T_ID:
      fprintf (stderr, "ID\t%s\n", tokid);
      break;

    case T_POS_NUM:
    case T_NEG_NUM:
      fprintf (stderr, "NUM\t%f\n", tokval);
      break;

    case T_STRING:
      fprintf (stderr, "STRING\t\"%s\"\n", ds_c_str (&tokstr));
      break;

    case T_STOP:
      fprintf (stderr, "STOP\n");
      break;

    case T_EXP:
      /* Ends with a newline like every other description. */
      fprintf (stderr, "MISC\tEXP\n");
      break;

    case 0:
      fprintf (stderr, "MISC\tEOF\n");
      break;

    default:
      if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
        fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
      else
        fprintf (stderr, "PUNCT\t%c\n", token);
      break;
    }
}
#endif /* DUMP_TOKENS */