src/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18    02111-1307, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <assert.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include "alloc.h"
  30 #include "command.h"
  31 #include "error.h"
  32 #include "getline.h"
  33 #include "magic.h"
  34 #include "settings.h"
  35 #include "str.h"
  36
  37 /*
  38 #define DUMP_TOKENS 1
  39 */
  40
  41 \f
  42 /* Global variables. */
  43
  44 /* Current token. */
  45 int token;
  46
  47 /* T_NUM: the token's value. */
  48 double tokval;
  49
  50 /* T_ID: the identifier. */
  51 char tokid[9];
  52
  53 /* T_ID, T_STRING: token string value.
  54    For T_ID, this is not truncated to 8 characters as is tokid. */
  55 struct string tokstr;
  56 \f
  57 /* Static variables. */
  58
  59 /* Table of keywords. */
  60 static const char *keywords[T_N_KEYWORDS + 1] =
  61   {
  62     "AND", "OR", "NOT",
  63     "EQ", "GE", "GT", "LE", "LT", "NE",
  64     "ALL", "BY", "TO", "WITH",
  65     NULL,
  66   };
  67
  68 /* Pointer to next token in getl_buf. */
  69 static char *prog;
  70
  71 /* Nonzero only if this line ends with a terminal dot. */
  72 static int dot;
  73
  74 /* Nonzero only if the last token returned was T_EOF. */
  75 static int eof;
  76
  77 /* If nonzero, next token returned by lex_get().
  78    Used only in exceptional circumstances. */
  79 static int put;
  80
  81 static void unexpected_eof (void);
  82 static inline int check_id (const char *id, size_t len);
  83 static void convert_numeric_string_to_char_string (int type);
  84 static int parse_string (int type);
  85
  86 #if DUMP_TOKENS
  87 static void dump_token (void);
  88 #endif
  89 \f
  90 /* Initialization. */
  91
  92 /* Initializes the lexer. */
  93 void
  94 lex_init (void)
  95 {
  96   if (!lex_get_line ())
  97     unexpected_eof ();
  98 }
  99 \f
 100 /* Common functions. */
 101
 102 /* Parses a single token, setting appropriate global variables to
 103    indicate the token's attributes. */
 104 void
 105 lex_get (void)
 106 {
 107   /* If a token was pushed ahead, return it. */
 108   if (put)
 109     {
 110       token = put;
 111       put = 0;
 112 #if DUMP_TOKENS
 113       dump_token ();
 114 #endif
 115       return;
 116     }
 117
 118   /* Find a token. */
 119   for (;;)
 120     {
 121       char *cp;
 122
 123       /* Skip whitespace. */
 124       if (eof)
 125         unexpected_eof ();
 126
 127       for (;;)
 128         {
 129           while (isspace ((unsigned char) *prog))
 130             prog++;
 131           if (*prog)
 132             break;
 133
 134           if (dot)
 135             {
 136               dot = 0;
 137               token = '.';
 138 #if DUMP_TOKENS
 139               dump_token ();
 140 #endif
 141               return;
 142             }
 143           else if (!lex_get_line ())
 144             {
 145               eof = 1;
 146               token = T_STOP;
 147 #if DUMP_TOKENS
 148               dump_token ();
 149 #endif
 150               return;
 151             }
 152
 153           if (put)
 154             {
 155               token = put;
 156               put = 0;
 157 #if DUMP_TOKENS
 158               dump_token ();
 159 #endif
 160               return;
 161             }
 162         }
 163
 164
 165       /* Actually parse the token. */
 166       cp = prog;
 167       ds_clear (&tokstr);
 168
 169       switch (*prog)
 170         {
 171         case '-': case '.':
 172         case '0': case '1': case '2': case '3': case '4':
 173         case '5': case '6': case '7': case '8': case '9':
 174           {
 175             char *tail;
 176
 177             /* `-' can introduce a negative number, or it can be a
 178                token by itself.  If it is not followed by a digit or a
 179                decimal point, it is definitely not a number.
 180                Otherwise, it might be either, but most of the time we
 181                want it as a number.  When the syntax calls for a `-'
 182                token, lex_negative_to_dash() must be used to break
 183                negative numbers into two tokens. */
 184             if (*cp == '-')
 185               {
 186                 ds_putchar (&tokstr, *prog++);
 187                 while (isspace ((unsigned char) *prog))
 188                   prog++;
 189
 190                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 191                   {
 192                     token = '-';
 193                     break;
 194                   }
 195               }
 196
 197             /* Parse the number, copying it into tokstr. */
 198             while (isdigit ((unsigned char) *prog))
 199               ds_putchar (&tokstr, *prog++);
 200             if (*prog == '.')
 201               {
 202                 ds_putchar (&tokstr, *prog++);
 203                 while (isdigit ((unsigned char) *prog))
 204                   ds_putchar (&tokstr, *prog++);
 205               }
 206             if (*prog == 'e' || *prog == 'E')
 207               {
 208                 ds_putchar (&tokstr, *prog++);
 209                 if (*prog == '+' || *prog == '-')
 210                   ds_putchar (&tokstr, *prog++);
 211                 while (isdigit ((unsigned char) *prog))
 212                   ds_putchar (&tokstr, *prog++);
 213               }
 214
 215             /* Parse as floating point. */
 216             tokval = strtod (ds_value (&tokstr), &tail);
 217             if (*tail)
 218               {
 219                 msg (SE, _("%s does not form a valid number."),
 220                      ds_value (&tokstr));
 221                 tokval = 0.0;
 222
 223                 ds_clear (&tokstr);
 224                 ds_putchar (&tokstr, '0');
 225               }
 226
 227             token = T_NUM;
 228             break;
 229           }
 230
 231         case '\'': case '"':
 232           token = parse_string (0);
 233           break;
 234
 235         case '(': case ')': case ',': case '=': case '+': case '/':
 236           token = *prog++;
 237           break;
 238
 239         case '*':
 240           if (*++prog == '*')
 241             {
 242               prog++;
 243               token = T_EXP;
 244             }
 245           else
 246             token = '*';
 247           break;
 248
 249         case '<':
 250           if (*++prog == '=')
 251             {
 252               prog++;
 253               token = T_LE;
 254             }
 255           else if (*prog == '>')
 256             {
 257               prog++;
 258               token = T_NE;
 259             }
 260           else
 261             token = T_LT;
 262           break;
 263
 264         case '>':
 265           if (*++prog == '=')
 266             {
 267               prog++;
 268               token = T_GE;
 269             }
 270           else
 271             token = T_GT;
 272           break;
 273
 274         case '~':
 275           if (*++prog == '=')
 276             {
 277               prog++;
 278               token = T_NE;
 279             }
 280           else
 281             token = T_NOT;
 282           break;
 283
 284         case '&':
 285           prog++;
 286           token = T_AND;
 287           break;
 288
 289         case '|':
 290           prog++;
 291           token = T_OR;
 292           break;
 293
 294         case 'a': case 'b': case 'c': case 'd': case 'e':
 295         case 'f': case 'g': case 'h': case 'i': case 'j':
 296         case 'k': case 'l': case 'm': case 'n': case 'o':
 297         case 'p': case 'q': case 'r': case 's': case 't':
 298         case 'u': case 'v': case 'w': case 'x': case 'y':
 299         case 'z':
 300         case 'A': case 'B': case 'C': case 'D': case 'E':
 301         case 'F': case 'G': case 'H': case 'I': case 'J':
 302         case 'K': case 'L': case 'M': case 'N': case 'O':
 303         case 'P': case 'Q': case 'R': case 'S': case 'T':
 304         case 'U': case 'V': case 'W': case 'X': case 'Y':
 305         case 'Z':
 306         case '#': case '$': case '@':
 307           /* Strings can be specified in binary, octal, or hex using
 308                this special syntax. */
 309           if (prog[1] == '\'' || prog[1] == '"')
 310             {
 311               static const char special[3] = "box";
 312               const char *p;
 313
 314               p = strchr (special, tolower ((unsigned char) *prog));
 315               if (p)
 316                 {
 317                   prog++;
 318                   token = parse_string (p - special + 1);
 319                   break;
 320                 }
 321             }
 322
 323           /* Copy id to tokstr. */
 324           ds_putchar (&tokstr, toupper ((unsigned char) *prog++));
 325           while (CHAR_IS_IDN (*prog))
 326             ds_putchar (&tokstr, toupper ((unsigned char) *prog++));
 327
 328           /* Copy tokstr to tokid, truncating it to 8 characters. */
 329           strncpy (tokid, ds_value (&tokstr), 8);
 330           tokid[8] = 0;
 331
 332           token = check_id (ds_value (&tokstr), ds_length (&tokstr));
 333           break;
 334
 335         default:
 336           if (isgraph ((unsigned char) *prog))
 337             msg (SE, _("Bad character in input: `%c'."), *prog++);
 338           else
 339             msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 340           continue;
 341         }
 342
 343       break;
 344     }
 345
 346 #if DUMP_TOKENS
 347   dump_token ();
 348 #endif
 349 }
 350
 351 /* Prints a syntax error message containing the current token and
 352    given message MESSAGE (if non-null). */
 353 void
 354 lex_error (const char *message, ...)
 355 {
 356   char *token_rep;
 357
 358   token_rep = lex_token_representation ();
 359   if (token_rep[0] == 0)
 360     msg (SE, _("Syntax error at end of file."));
 361   else if (message)
 362     {
 363       char buf[1024];
 364       va_list args;
 365
 366       va_start (args, message);
 367       vsnprintf (buf, 1024, message, args);
 368       va_end (args);
 369
 370       msg (SE, _("Syntax error %s at `%s'."), buf, token_rep);
 371     }
 372   else
 373     msg (SE, _("Syntax error at `%s'."), token_rep);
 374
 375   free (token_rep);
 376 }
 377
 378 /* Checks that we're at end of command.
 379    If so, returns a successful command completion code.
 380    If not, flags a syntax error and returns an error command
 381    completion code. */
 382 int
 383 lex_end_of_command (void)
 384 {
 385   if (token != '.')
 386     {
 387       lex_error (_("expecting end of command"));
 388       return CMD_TRAILING_GARBAGE;
 389     }
 390   else
 391     return CMD_SUCCESS;
 392 }
 393 \f
 394 /* Token testing functions. */
 395
 396 /* Returns nonzero if the current token is an integer. */
 397 int
 398 lex_integer_p (void)
 399 {
 400   return (token == T_NUM
 401           && tokval != NOT_LONG
 402           && tokval >= LONG_MIN
 403           && tokval <= LONG_MAX
 404           && floor (tokval) == tokval);
 405 }
 406
 407 /* Returns the value of the current token, which must be an
 408    integer. */
 409 long
 410 lex_integer (void)
 411 {
 412   assert (lex_integer_p ());
 413   return tokval;
 414 }
 415 /* Returns nonzero if the current token is an floating point. */
 416 int
 417 lex_double_p (void)
 418 {
 419   return ( token == T_NUM
 420            && tokval != NOT_DOUBLE );
 421 }
 422
 423 /* Returns the value of the current token, which must be a
 424    floating point number. */
 425 double
 426 lex_double (void)
 427 {
 428   assert (lex_double_p ());
 429   return tokval;
 430 }
 431
 432 \f
 433 /* Token matching functions. */
 434
 435 /* If TOK is the current token, skips it and returns nonzero.
 436    Otherwise, returns zero. */
 437 int
 438 lex_match (int t)
 439 {
 440   if (token == t)
 441     {
 442       lex_get ();
 443       return 1;
 444     }
 445   else
 446     return 0;
 447 }
 448
 449 /* If the current token is the identifier S, skips it and returns
 450    nonzero.
 451    Otherwise, returns zero. */
 452 int
 453 lex_match_id (const char *s)
 454 {
 455   if (token == T_ID && lex_id_match (s, tokid))
 456     {
 457       lex_get ();
 458       return 1;
 459     }
 460   else
 461     return 0;
 462 }
 463
 464 /* If the current token is integer N, skips it and returns nonzero.
 465    Otherwise, returns zero. */
 466 int
 467 lex_match_int (int x)
 468 {
 469   if (lex_integer_p () && lex_integer () == x)
 470     {
 471       lex_get ();
 472       return 1;
 473     }
 474   else
 475     return 0;
 476 }
 477 \f
 478 /* Forced matches. */
 479
 480 /* If this token is identifier S, fetches the next token and returns
 481    nonzero.
 482    Otherwise, reports an error and returns zero. */
 483 int
 484 lex_force_match_id (const char *s)
 485 {
 486   if (token == T_ID && lex_id_match (s, tokid))
 487     {
 488       lex_get ();
 489       return 1;
 490     }
 491   else
 492     {
 493       lex_error (_("expecting `%s'"), s);
 494       return 0;
 495     }
 496 }
 497
 498 /* If the current token is T, skips the token.  Otherwise, reports an
 499    error and returns from the current function with return value 0. */
 500 int
 501 lex_force_match (int t)
 502 {
 503   if (token == t)
 504     {
 505       lex_get ();
 506       return 1;
 507     }
 508   else
 509     {
 510       lex_error (_("expecting %s"), lex_token_name (t));
 511       return 0;
 512     }
 513 }
 514
 515 /* If this token is a string, does nothing and returns nonzero.
 516    Otherwise, reports an error and returns zero. */
 517 int
 518 lex_force_string (void)
 519 {
 520   if (token == T_STRING)
 521     return 1;
 522   else
 523     {
 524       lex_error (_("expecting string"));
 525       return 0;
 526     }
 527 }
 528
 529 /* If this token is an integer, does nothing and returns nonzero.
 530    Otherwise, reports an error and returns zero. */
 531 int
 532 lex_force_int (void)
 533 {
 534   if (lex_integer_p ())
 535     return 1;
 536   else
 537     {
 538       lex_error (_("expecting integer"));
 539       return 0;
 540     }
 541 }
 542
 543 /* If this token is a number, does nothing and returns nonzero.
 544    Otherwise, reports an error and returns zero. */
 545 int
 546 lex_force_num (void)
 547 {
 548   if (token == T_NUM)
 549     return 1;
 550   else
 551     {
 552       lex_error (_("expecting number"));
 553       return 0;
 554     }
 555 }
 556
 557 /* If this token is an identifier, does nothing and returns nonzero.
 558    Otherwise, reports an error and returns zero. */
 559 int
 560 lex_force_id (void)
 561 {
 562   if (token == T_ID)
 563     return 1;
 564   else
 565     {
 566       lex_error (_("expecting identifier"));
 567       return 0;
 568     }
 569 }
 570 \f
 571 /* Comparing identifiers. */
 572
 573 /* Keywords match if one of the following is true: KW and TOK are
 574    identical (barring differences in case), or TOK is at least 3
 575    characters long and those characters are identical to KW.  KW_LEN
 576    is the length of KW, TOK_LEN is the length of TOK. */
 577 int
 578 lex_id_match_len (const char *kw, size_t kw_len,
 579                   const char *tok, size_t tok_len)
 580 {
 581   size_t i = 0;
 582
 583   assert (kw && tok);
 584   for (;;)
 585     {
 586       if (i == kw_len && i == tok_len)
 587         return 1;
 588       else if (i == tok_len)
 589         return i >= 3;
 590       else if (i == kw_len)
 591         return 0;
 592       else if (toupper ((unsigned char) kw[i])
 593                != toupper ((unsigned char) tok[i]))
 594         return 0;
 595
 596       i++;
 597     }
 598 }
 599
 600 /* Same as lex_id_match_len() minus the need to pass in the lengths. */
 601 int
 602 lex_id_match (const char *kw, const char *tok)
 603 {
 604   return lex_id_match_len (kw, strlen (kw), tok, strlen (tok));
 605 }
 606 \f
 607 /* Weird token functions. */
 608
 609 /* Returns the first character of the next token, except that if the
 610    next token is not an identifier, the character returned will not be
 611    a character that can begin an identifier.  Specifically, the
 612    hexstring lead-in X' causes lookahead() to return '.  Note that an
 613    alphanumeric return value doesn't guarantee an ID token, it could
 614    also be a reserved-word token. */
 615 int
 616 lex_look_ahead (void)
 617 {
 618   if (put)
 619     return put;
 620
 621   for (;;)
 622     {
 623       if (eof)
 624         unexpected_eof ();
 625
 626       for (;;)
 627         {
 628           while (isspace ((unsigned char) *prog))
 629             prog++;
 630           if (*prog)
 631             break;
 632
 633           if (dot)
 634             return '.';
 635           else if (!lex_get_line ())
 636             unexpected_eof ();
 637
 638           if (put)
 639             return put;
 640         }
 641
 642       if ((toupper ((unsigned char) *prog) == 'X'
 643            || toupper ((unsigned char) *prog) == 'B')
 644           && (prog[1] == '\'' || prog[1] == '"'))
 645         return '\'';
 646
 647       return *prog;
 648     }
 649 }
 650
 651 /* Makes the current token become the next token to be read; the
 652    current token is set to T. */
 653 void
 654 lex_put_back (int t)
 655 {
 656   put = token;
 657   token = t;
 658 }
 659
 660 /* Makes T the next token read. */
 661 void
 662 lex_put_forward (int t)
 663 {
 664   put = t;
 665 }
 666 \f
 667 /* Weird line processing functions. */
 668
 669 /* Discards the rest of the current input line for tokenization
 670    purposes, but returns the entire contents of the line for use by
 671    the caller. */
 672 char *
 673 lex_entire_line (void)
 674 {
 675   prog = ds_end (&getl_buf);
 676   dot = 0;
 677   return ds_value (&getl_buf);
 678 }
 679
 680 /* As lex_entire_line(), but only returns the part of the current line
 681    that hasn't already been tokenized.
 682    If HAD_DOT is non-null, stores nonzero into *HAD_DOT if the line
 683    ends with a terminal dot, or zero if it doesn't. */
 684 char *
 685 lex_rest_of_line (int *had_dot)
 686 {
 687   char *s = prog;
 688   prog = ds_end (&getl_buf);
 689
 690   if (had_dot)
 691     *had_dot = dot;
 692   dot = 0;
 693
 694   return s;
 695 }
 696
 697 /* Causes the rest of the current input line to be ignored for
 698    tokenization purposes. */
 699 void
 700 lex_discard_line (void)
 701 {
 702   msg (SW, _("The rest of this command has been discarded."));
 703
 704   ds_clear (&getl_buf);
 705   prog = ds_value (&getl_buf);
 706   dot = put = 0;
 707 }
 708
 709 /* Sets the current position in the current line to P, which must be
 710    in getl_buf. */
 711 void
 712 lex_set_prog (char *p)
 713 {
 714   prog = p;
 715 }
 716 \f
 717 /* Weird line reading functions. */
 718
 719 /* Read a line for use by the tokenizer. */
 720 int
 721 lex_get_line (void)
 722 {
 723   if (!getl_read_line ())
 724     return 0;
 725
 726   lex_preprocess_line ();
 727   return 1;
 728 }
 729
 730 /* Preprocesses getl_buf by removing comments, stripping trailing
 731    whitespace and the terminal dot, and removing leading indentors. */
 732 void
 733 lex_preprocess_line (void)
 734 {
 735   /* Strips comments. */
 736   {
 737     /* getl_buf iterator. */
 738     char *cp;
 739
 740     /* Nonzero inside a comment. */
 741     int comment;
 742
 743     /* Nonzero inside a quoted string. */
 744     int quote;
 745
 746     /* Remove C-style comments begun by slash-star and terminated by
 747      star-slash or newline. */
 748     quote = comment = 0;
 749     for (cp = ds_value (&getl_buf); *cp; )
 750       {
 751         /* If we're not commented out, toggle quoting. */
 752         if (!comment)
 753           {
 754             if (*cp == quote)
 755               quote = 0;
 756             else if (*cp == '\'' || *cp == '"')
 757               quote = *cp;
 758           }
 759
 760         /* If we're not quoting, toggle commenting. */
 761         if (!quote)
 762           {
 763             if (cp[0] == '/' && cp[1] == '*')
 764               {
 765                 comment = 1;
 766                 *cp++ = ' ';
 767                 *cp++ = ' ';
 768                 continue;
 769               }
 770             else if (cp[0] == '*' && cp[1] == '/' && comment)
 771               {
 772                 comment = 0;
 773                 *cp++ = ' ';
 774                 *cp++ = ' ';
 775                 continue;
 776               }
 777           }
 778
 779         /* Check commenting. */
 780         if (!comment)
 781           cp++;
 782         else
 783           *cp++ = ' ';
 784       }
 785   }
 786
 787   /* Strip trailing whitespace and terminal dot. */
 788   {
 789     size_t len = ds_length (&getl_buf);
 790     char *s = ds_value (&getl_buf);
 791
 792     /* Strip trailing whitespace. */
 793     while (len > 0 && isspace ((unsigned char) s[len - 1]))
 794       len--;
 795
 796     /* Check for and remove terminal dot. */
 797     if (len > 0 && s[len - 1] == get_endcmd() )
 798       {
 799         dot = 1;
 800         len--;
 801       }
 802     else if (len == 0 && get_nullline() )
 803       dot = 1;
 804     else
 805       dot = 0;
 806
 807     /* Set length. */
 808     ds_truncate (&getl_buf, len);
 809   }
 810
 811   /* In batch mode, strip leading indentors and insert a terminal dot
 812      as necessary. */
 813   if (getl_interactive != 2 && getl_mode == GETL_MODE_BATCH)
 814     {
 815       char *s = ds_value (&getl_buf);
 816
 817       if (s[0] == '+' || s[0] == '-' || s[0] == '.')
 818         s[0] = ' ';
 819       else if (s[0] && !isspace ((unsigned char) s[0]))
 820         lex_put_forward ('.');
 821     }
 822
 823   prog = ds_value (&getl_buf);
 824 }
 825 \f
 826 /* Token names. */
 827
 828 /* Returns the name of a token in a static buffer. */
 829 const char *
 830 lex_token_name (int token)
 831 {
 832   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 833     return keywords[token - T_FIRST_KEYWORD];
 834
 835   if (token < 256)
 836     {
 837       static char t[2];
 838       t[0] = token;
 839       return t;
 840     }
 841
 842   return _("<ERROR>");
 843 }
 844
 845 /* Returns an ASCII representation of the current token as a
 846    malloc()'d string. */
 847 char *
 848 lex_token_representation (void)
 849 {
 850   char *token_rep;
 851
 852   switch (token)
 853     {
 854     case T_ID:
 855     case T_NUM:
 856       return xstrdup (ds_value (&tokstr));
 857       break;
 858
 859     case T_STRING:
 860       {
 861         int hexstring = 0;
 862         char *sp, *dp;
 863
 864         for (sp = ds_value (&tokstr); sp < ds_end (&tokstr); sp++)
 865           if (!isprint ((unsigned char) *sp))
 866             {
 867               hexstring = 1;
 868               break;
 869             }
 870
 871         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 872
 873         dp = token_rep;
 874         if (hexstring)
 875           *dp++ = 'X';
 876         *dp++ = '\'';
 877
 878         if (!hexstring)
 879           for (sp = ds_value (&tokstr); *sp; )
 880             {
 881               if (*sp == '\'')
 882                 *dp++ = '\'';
 883               *dp++ = (unsigned char) *sp++;
 884             }
 885         else
 886           for (sp = ds_value (&tokstr); sp < ds_end (&tokstr); sp++)
 887             {
 888               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 889               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 890             }
 891         *dp++ = '\'';
 892         *dp = '\0';
 893
 894         return token_rep;
 895       }
 896     break;
 897
 898     case T_STOP:
 899       token_rep = xmalloc (1);
 900       *token_rep = '\0';
 901       return token_rep;
 902
 903     case T_EXP:
 904       return xstrdup ("**");
 905
 906     default:
 907       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 908         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 909       else
 910         {
 911           token_rep = xmalloc (2);
 912           token_rep[0] = token;
 913           token_rep[1] = '\0';
 914           return token_rep;
 915         }
 916     }
 917
 918   assert (0);
 919 }
 920 \f
 921 /* Really weird functions. */
 922
 923 /* Most of the time, a `-' is a lead-in to a negative number.  But
 924    sometimes it's actually part of the syntax.  If a dash can be part
 925    of syntax then this function is called to rip it off of a
 926    number. */
 927 void
 928 lex_negative_to_dash (void)
 929 {
 930   if (token == T_NUM && tokval < 0.0)
 931     {
 932       token = '-';
 933       tokval = -tokval;
 934       ds_replace (&tokstr, ds_value (&tokstr) + 1);
 935       lex_put_forward (T_NUM);
 936     }
 937 }
 938
 939 /* We're not at eof any more. */
 940 void
 941 lex_reset_eof (void)
 942 {
 943   eof = 0;
 944 }
 945
 946 /* Skip a COMMENT command. */
 947 void
 948 lex_skip_comment (void)
 949 {
 950   for (;;)
 951     {
 952       lex_get_line ();
 953       if (put == '.')
 954         break;
 955
 956       prog = ds_end (&getl_buf);
 957       if (dot)
 958         break;
 959     }
 960 }
 961 \f
 962 /* Private functions. */
 963
 964 /* Unexpected end of file. */
 965 static void
 966 unexpected_eof (void)
 967 {
 968   msg (FE, _("Unexpected end of file."));
 969 }
 970
 971 /* Returns the proper token type, either T_ID or a reserved keyword
 972    enum, for ID[], which must contain LEN characters. */
 973 static inline int
 974 check_id (const char *id, size_t len)
 975 {
 976   const char **kwp;
 977
 978   if (len < 2 || len > 4)
 979     return T_ID;
 980
 981   for (kwp = keywords; *kwp; kwp++)
 982     if (!strcmp (*kwp, id))
 983       return T_FIRST_KEYWORD + (kwp - keywords);
 984
 985   return T_ID;
 986 }
 987
 988 /* When invoked, tokstr contains a string of binary, octal, or hex
 989    digits, for values of TYPE of 0, 1, or 2, respectively.  The string
 990    is converted to characters having the specified values. */
 991 static void
 992 convert_numeric_string_to_char_string (int type)
 993 {
 994   static const char *base_names[] = {N_("binary"), N_("octal"), N_("hex")};
 995   static const int bases[] = {2, 8, 16};
 996   static const int chars_per_byte[] = {8, 3, 2};
 997
 998   const char *const base_name = base_names[type];
 999   const int base = bases[type];
1000   const int cpb = chars_per_byte[type];
1001   const int nb = ds_length (&tokstr) / cpb;
1002   int i;
1003   char *p;
1004
1005   assert (type >= 0 && type <= 2);
1006
1007   if (ds_length (&tokstr) % cpb)
1008     msg (SE, _("String of %s digits has %d characters, which is not a "
1009                "multiple of %d."),
1010          gettext (base_name), ds_length (&tokstr), cpb);
1011
1012   p = ds_value (&tokstr);
1013   for (i = 0; i < nb; i++)
1014     {
1015       int value;
1016       int j;
1017
1018       value = 0;
1019       for (j = 0; j < cpb; j++, p++)
1020         {
1021           int v;
1022
1023           if (*p >= '0' && *p <= '9')
1024             v = *p - '0';
1025           else
1026             {
1027               static const char alpha[] = "abcdef";
1028               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1029
1030               if (q)
1031                 v = q - alpha + 10;
1032               else
1033                 v = base;
1034             }
1035
1036           if (v >= base)
1037             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1038
1039           value = value * base + v;
1040         }
1041
1042       ds_value (&tokstr)[i] = (unsigned char) value;
1043     }
1044
1045   ds_truncate (&tokstr, nb);
1046 }
1047
1048 /* Parses a string from the input buffer into tokstr.  The input
1049    buffer pointer prog must point to the initial single or double
1050    quote.  TYPE is 0 if it is an ordinary string, or 1, 2, or 3 for a
1051    binary, octal, or hexstring, respectively.  Returns token type. */
1052 static int
1053 parse_string (int type)
1054 {
1055   /* Accumulate the entire string, joining sections indicated by +
1056      signs. */
1057   for (;;)
1058     {
1059       /* Single or double quote. */
1060       int c = *prog++;
1061
1062       /* Accumulate section. */
1063       for (;;)
1064         {
1065           /* Check end of line. */
1066           if (*prog == 0)
1067             {
1068               msg (SE, _("Unterminated string constant."));
1069               goto finish;
1070             }
1071
1072           /* Double quote characters to embed them in strings. */
1073           if (*prog == c)
1074             {
1075               if (prog[1] == c)
1076                 prog++;
1077               else
1078                 break;
1079             }
1080
1081           ds_putchar (&tokstr, *prog++);
1082         }
1083       prog++;
1084
1085       /* Skip whitespace after final quote mark. */
1086       if (eof)
1087         break;
1088       for (;;)
1089         {
1090           while (isspace ((unsigned char) *prog))
1091             prog++;
1092           if (*prog)
1093             break;
1094
1095           if (dot)
1096             goto finish;
1097
1098           if (!lex_get_line ())
1099             unexpected_eof ();
1100         }
1101
1102       /* Skip plus sign. */
1103       if (*prog != '+')
1104         break;
1105       prog++;
1106
1107       /* Skip whitespace after plus sign. */
1108       if (eof)
1109         break;
1110       for (;;)
1111         {
1112           while (isspace ((unsigned char) *prog))
1113             prog++;
1114           if (*prog)
1115             break;
1116
1117           if (dot)
1118             goto finish;
1119
1120           if (!lex_get_line ())
1121             unexpected_eof ();
1122         }
1123
1124       /* Ensure that a valid string follows. */
1125       if (*prog != '\'' && *prog != '"')
1126         {
1127           msg (SE, "String expected following `+'.");
1128           goto finish;
1129         }
1130     }
1131
1132   /* We come here when we've finished concatenating all the string sections
1133      into one large string. */
1134 finish:
1135   if (type != 0)
1136     convert_numeric_string_to_char_string (type - 1);
1137
1138   if (ds_length (&tokstr) > 255)
1139     {
1140       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1141            ds_length (&tokstr));
1142       ds_truncate (&tokstr, 255);
1143     }
1144
1145   {
1146     /* FIXME. */
1147     size_t i;
1148     int warned = 0;
1149
1150     for (i = 0; i < ds_length (&tokstr); i++)
1151       if (ds_value (&tokstr)[i] == 0)
1152         {
1153           if (!warned)
1154             {
1155               msg (SE, _("Sorry, literal strings may not contain null "
1156                          "characters.  Replacing with spaces."));
1157               warned = 1;
1158             }
1159           ds_value (&tokstr)[i] = ' ';
1160         }
1161   }
1162
1163   return T_STRING;
1164 }
1165 \f
1166 #if DUMP_TOKENS
1167 /* Reads one token from the lexer and writes a textual representation
1168    on stdout for debugging purposes. */
1169 static void
1170 dump_token (void)
1171 {
1172   {
1173     const char *curfn;
1174     int curln;
1175
1176     getl_location (&curfn, &curln);
1177     if (curfn)
1178       printf ("%s:%d\t", curfn, curln);
1179   }
1180
1181   switch (token)
1182     {
1183     case T_ID:
1184       printf ("ID\t%s\n", tokid);
1185       break;
1186
1187     case T_NUM:
1188       printf ("NUM\t%f\n", tokval);
1189       break;
1190
1191     case T_STRING:
1192       printf ("STRING\t\"%s\"\n", ds_value (&tokstr));
1193       break;
1194
1195     case T_STOP:
1196       printf ("STOP\n");
1197       break;
1198
1199     case T_EXP:
1200       puts ("MISC\tEXP");
1201       break;
1202
1203     case 0:
1204       puts ("MISC\tEOF");
1205       break;
1206
1207     default:
1208       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1209         printf ("KEYWORD\t%s\n", lex_token_name (token));
1210       else
1211         printf ("PUNCT\t%c\n", token);
1212       break;
1213     }
1214 }
1215 #endif /* DEBUGGING */