src/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18    02111-1307, USA. */
  19
  20 #include <config.h>
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <limits.h>
  25 #include <math.h>
  26 #include <stdarg.h>
  27 #include <stdlib.h>
  28 #include "alloc.h"
  29 #include "command.h"
  30 #include "error.h"
  31 #include "getline.h"
  32 #include "lexer.h"
  33 #include "magic.h"
  34 #include "settings.h"
  35 #include "str.h"
  36
  37 /*#define DUMP_TOKENS 1*/
  38
  39 \f
  40 /* Global variables. */
  41
  42 /* Current token. */
  43 int token;
  44
  45 /* T_NUM: the token's value. */
  46 double tokval;
  47
  48 /* T_ID: the identifier. */
  49 char tokid[9];
  50
  51 /* T_ID, T_STRING: token string value.
  52    For T_ID, this is not truncated to 8 characters as is tokid. */
  53 struct string tokstr;
  54 \f
  55 /* Static variables. */
  56
  57 /* Table of keywords. */
  58 static const char *keywords[T_N_KEYWORDS + 1] =
  59   {
  60     "AND", "OR", "NOT",
  61     "EQ", "GE", "GT", "LE", "LT", "NE",
  62     "ALL", "BY", "TO", "WITH",
  63     NULL,
  64   };
  65
  66 /* Pointer to next token in getl_buf. */
  67 static char *prog;
  68
  69 /* Nonzero only if this line ends with a terminal dot. */
  70 static int dot;
  71
  72 /* Nonzero only if the last token returned was T_EOF. */
  73 static int eof;
  74
  75 /* If nonzero, next token returned by lex_get().
  76    Used only in exceptional circumstances. */
  77 static int put;
  78
  79 static void unexpected_eof (void);
  80 static inline int check_id (const char *id, size_t len);
  81 static void convert_numeric_string_to_char_string (int type);
  82 static int parse_string (int type);
  83
  84 #if DUMP_TOKENS
  85 static void dump_token (void);
  86 #endif
  87 \f
  88 /* Initialization. */
  89
  90 /* Initializes the lexer. */
  91 void
  92 lex_init (void)
  93 {
  94   if (!lex_get_line ())
  95     unexpected_eof ();
  96 }
  97 \f
  98 /* Common functions. */
  99
 100 /* Parses a single token, setting appropriate global variables to
 101    indicate the token's attributes. */
 102 void
 103 lex_get (void)
 104 {
 105   /* If a token was pushed ahead, return it. */
 106   if (put)
 107     {
 108       token = put;
 109       put = 0;
 110 #if DUMP_TOKENS
 111       dump_token ();
 112 #endif
 113       return;
 114     }
 115
 116   /* Find a token. */
 117   for (;;)
 118     {
 119       char *cp;
 120
 121       /* Skip whitespace. */
 122       if (eof)
 123         unexpected_eof ();
 124
 125       for (;;)
 126         {
 127           while (isspace ((unsigned char) *prog))
 128             prog++;
 129           if (*prog)
 130             break;
 131
 132           if (dot)
 133             {
 134               dot = 0;
 135               token = '.';
 136 #if DUMP_TOKENS
 137               dump_token ();
 138 #endif
 139               return;
 140             }
 141           else if (!lex_get_line ())
 142             {
 143               eof = 1;
 144               token = T_STOP;
 145 #if DUMP_TOKENS
 146               dump_token ();
 147 #endif
 148               return;
 149             }
 150
 151           if (put)
 152             {
 153               token = put;
 154               put = 0;
 155 #if DUMP_TOKENS
 156               dump_token ();
 157 #endif
 158               return;
 159             }
 160         }
 161
 162       /* Actually parse the token. */
 163       cp = prog;
 164       ds_clear (&tokstr);
 165
 166       switch (*prog)
 167         {
 168         case '-': case '.':
 169         case '0': case '1': case '2': case '3': case '4':
 170         case '5': case '6': case '7': case '8': case '9':
 171           {
 172             char *tail;
 173
 174             /* `-' can introduce a negative number, or it can be a
 175                token by itself.  If it is not followed by a digit or a
 176                decimal point, it is definitely not a number.
 177                Otherwise, it might be either, but most of the time we
 178                want it as a number.  When the syntax calls for a `-'
 179                token, lex_negative_to_dash() must be used to break
 180                negative numbers into two tokens. */
 181             if (*cp == '-')
 182               {
 183                 ds_putchar (&tokstr, *prog++);
 184                 while (isspace ((unsigned char) *prog))
 185                   prog++;
 186
 187                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 188                   {
 189                     token = '-';
 190                     break;
 191                   }
 192               }
 193
 194             /* Parse the number, copying it into tokstr. */
 195             while (isdigit ((unsigned char) *prog))
 196               ds_putchar (&tokstr, *prog++);
 197             if (*prog == '.')
 198               {
 199                 ds_putchar (&tokstr, *prog++);
 200                 while (isdigit ((unsigned char) *prog))
 201                   ds_putchar (&tokstr, *prog++);
 202               }
 203             if (*prog == 'e' || *prog == 'E')
 204               {
 205                 ds_putchar (&tokstr, *prog++);
 206                 if (*prog == '+' || *prog == '-')
 207                   ds_putchar (&tokstr, *prog++);
 208                 while (isdigit ((unsigned char) *prog))
 209                   ds_putchar (&tokstr, *prog++);
 210               }
 211
 212             /* Parse as floating point. */
 213             tokval = strtod (ds_value (&tokstr), &tail);
 214             if (*tail)
 215               {
 216                 msg (SE, _("%s does not form a valid number."),
 217                      ds_value (&tokstr));
 218                 tokval = 0.0;
 219
 220                 ds_clear (&tokstr);
 221                 ds_putchar (&tokstr, '0');
 222               }
 223
 224             token = T_NUM;
 225             break;
 226           }
 227
 228         case '\'': case '"':
 229           token = parse_string (0);
 230           break;
 231
 232         case '(': case ')': case ',': case '=': case '+': case '/':
 233           token = *prog++;
 234           break;
 235
 236         case '*':
 237           if (*++prog == '*')
 238             {
 239               prog++;
 240               token = T_EXP;
 241             }
 242           else
 243             token = '*';
 244           break;
 245
 246         case '<':
 247           if (*++prog == '=')
 248             {
 249               prog++;
 250               token = T_LE;
 251             }
 252           else if (*prog == '>')
 253             {
 254               prog++;
 255               token = T_NE;
 256             }
 257           else
 258             token = T_LT;
 259           break;
 260
 261         case '>':
 262           if (*++prog == '=')
 263             {
 264               prog++;
 265               token = T_GE;
 266             }
 267           else
 268             token = T_GT;
 269           break;
 270
 271         case '~':
 272           if (*++prog == '=')
 273             {
 274               prog++;
 275               token = T_NE;
 276             }
 277           else
 278             token = T_NOT;
 279           break;
 280
 281         case '&':
 282           prog++;
 283           token = T_AND;
 284           break;
 285
 286         case '|':
 287           prog++;
 288           token = T_OR;
 289           break;
 290
 291         case 'a': case 'b': case 'c': case 'd': case 'e':
 292         case 'f': case 'g': case 'h': case 'i': case 'j':
 293         case 'k': case 'l': case 'm': case 'n': case 'o':
 294         case 'p': case 'q': case 'r': case 's': case 't':
 295         case 'u': case 'v': case 'w': case 'x': case 'y':
 296         case 'z':
 297         case 'A': case 'B': case 'C': case 'D': case 'E':
 298         case 'F': case 'G': case 'H': case 'I': case 'J':
 299         case 'K': case 'L': case 'M': case 'N': case 'O':
 300         case 'P': case 'Q': case 'R': case 'S': case 'T':
 301         case 'U': case 'V': case 'W': case 'X': case 'Y':
 302         case 'Z':
 303         case '#': case '$': case '@':
 304           /* Strings can be specified in binary, octal, or hex using
 305                this special syntax. */
 306           if (prog[1] == '\'' || prog[1] == '"')
 307             {
 308               static const char special[3] = "box";
 309               const char *p;
 310
 311               p = strchr (special, tolower ((unsigned char) *prog));
 312               if (p)
 313                 {
 314                   prog++;
 315                   token = parse_string (p - special + 1);
 316                   break;
 317                 }
 318             }
 319
 320           /* Copy id to tokstr. */
 321           ds_putchar (&tokstr, toupper ((unsigned char) *prog++));
 322           while (CHAR_IS_IDN (*prog))
 323             ds_putchar (&tokstr, toupper ((unsigned char) *prog++));
 324
 325           /* Copy tokstr to tokid, truncating it to 8 characters. */
 326           strncpy (tokid, ds_value (&tokstr), 8);
 327           tokid[8] = 0;
 328
 329           token = check_id (ds_value (&tokstr), ds_length (&tokstr));
 330           break;
 331
 332         default:
 333           if (isgraph ((unsigned char) *prog))
 334             msg (SE, _("Bad character in input: `%c'."), *prog++);
 335           else
 336             msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 337           continue;
 338         }
 339
 340       break;
 341     }
 342
 343 #if DUMP_TOKENS
 344   dump_token ();
 345 #endif
 346 }
 347
 348 /* Prints a syntax error message containing the current token and
 349    given message MESSAGE (if non-null). */
 350 void
 351 lex_error (const char *message, ...)
 352 {
 353   char *token_rep;
 354
 355   token_rep = lex_token_representation ();
 356   if (token_rep[0] == 0)
 357     msg (SE, _("Syntax error at end of file."));
 358   else if (message)
 359     {
 360       char buf[1024];
 361       va_list args;
 362
 363       va_start (args, message);
 364       vsnprintf (buf, 1024, message, args);
 365       va_end (args);
 366
 367       msg (SE, _("Syntax error %s at `%s'."), buf, token_rep);
 368     }
 369   else
 370     msg (SE, _("Syntax error at `%s'."), token_rep);
 371
 372   free (token_rep);
 373 }
 374
 375 /* Checks that we're at end of command.
 376    If so, returns a successful command completion code.
 377    If not, flags a syntax error and returns an error command
 378    completion code. */
 379 int
 380 lex_end_of_command (void)
 381 {
 382   if (token != '.')
 383     {
 384       lex_error (_("expecting end of command"));
 385       return CMD_TRAILING_GARBAGE;
 386     }
 387   else
 388     return CMD_SUCCESS;
 389 }
 390 \f
 391 /* Token testing functions. */
 392
 393 /* Returns nonzero if the current token is an integer. */
 394 int
 395 lex_integer_p (void)
 396 {
 397   return (token == T_NUM
 398           && tokval != NOT_LONG
 399           && tokval >= LONG_MIN
 400           && tokval <= LONG_MAX
 401           && floor (tokval) == tokval);
 402 }
 403
 404 /* Returns the value of the current token, which must be an
 405    integer. */
 406 long
 407 lex_integer (void)
 408 {
 409   assert (lex_integer_p ());
 410   return tokval;
 411 }
 412 \f
 413 /* Token matching functions. */
 414
 415 /* If TOK is the current token, skips it and returns nonzero.
 416    Otherwise, returns zero. */
 417 int
 418 lex_match (int t)
 419 {
 420   if (token == t)
 421     {
 422       lex_get ();
 423       return 1;
 424     }
 425   else
 426     return 0;
 427 }
 428
 429 /* If the current token is the identifier S, skips it and returns
 430    nonzero.
 431    Otherwise, returns zero. */
 432 int
 433 lex_match_id (const char *s)
 434 {
 435   if (token == T_ID && lex_id_match (s, tokid))
 436     {
 437       lex_get ();
 438       return 1;
 439     }
 440   else
 441     return 0;
 442 }
 443
 444 /* If the current token is integer N, skips it and returns nonzero.
 445    Otherwise, returns zero. */
 446 int
 447 lex_match_int (int x)
 448 {
 449   if (lex_integer_p () && lex_integer () == x)
 450     {
 451       lex_get ();
 452       return 1;
 453     }
 454   else
 455     return 0;
 456 }
 457 \f
 458 /* Forced matches. */
 459
 460 /* If this token is identifier S, fetches the next token and returns
 461    nonzero.
 462    Otherwise, reports an error and returns zero. */
 463 int
 464 lex_force_match_id (const char *s)
 465 {
 466   if (token == T_ID && lex_id_match (s, tokid))
 467     {
 468       lex_get ();
 469       return 1;
 470     }
 471   else
 472     {
 473       lex_error (_("expecting `%s'"), s);
 474       return 0;
 475     }
 476 }
 477
 478 /* If the current token is T, skips the token.  Otherwise, reports an
 479    error and returns from the current function with return value 0. */
 480 int
 481 lex_force_match (int t)
 482 {
 483   if (token == t)
 484     {
 485       lex_get ();
 486       return 1;
 487     }
 488   else
 489     {
 490       lex_error (_("expecting %s"), lex_token_name (t));
 491       return 0;
 492     }
 493 }
 494
 495 /* If this token is a string, does nothing and returns nonzero.
 496    Otherwise, reports an error and returns zero. */
 497 int
 498 lex_force_string (void)
 499 {
 500   if (token == T_STRING)
 501     return 1;
 502   else
 503     {
 504       lex_error (_("expecting string"));
 505       return 0;
 506     }
 507 }
 508
 509 /* If this token is an integer, does nothing and returns nonzero.
 510    Otherwise, reports an error and returns zero. */
 511 int
 512 lex_force_int (void)
 513 {
 514   if (lex_integer_p ())
 515     return 1;
 516   else
 517     {
 518       lex_error (_("expecting integer"));
 519       return 0;
 520     }
 521 }
 522
 523 /* If this token is a number, does nothing and returns nonzero.
 524    Otherwise, reports an error and returns zero. */
 525 int
 526 lex_force_num (void)
 527 {
 528   if (token == T_NUM)
 529     return 1;
 530   else
 531     {
 532       lex_error (_("expecting number"));
 533       return 0;
 534     }
 535 }
 536
 537 /* If this token is an identifier, does nothing and returns nonzero.
 538    Otherwise, reports an error and returns zero. */
 539 int
 540 lex_force_id (void)
 541 {
 542   if (token == T_ID)
 543     return 1;
 544   else
 545     {
 546       lex_error (_("expecting identifier"));
 547       return 0;
 548     }
 549 }
 550 \f
 551 /* Comparing identifiers. */
 552
 553 /* Keywords match if one of the following is true: KW and TOK are
 554    identical (barring differences in case), or TOK is at least 3
 555    characters long and those characters are identical to KW.  KW_LEN
 556    is the length of KW, TOK_LEN is the length of TOK. */
 557 int
 558 lex_id_match_len (const char *kw, size_t kw_len,
 559                   const char *tok, size_t tok_len)
 560 {
 561   size_t i = 0;
 562
 563   assert (kw && tok);
 564   for (;;)
 565     {
 566       if (i == kw_len && i == tok_len)
 567         return 1;
 568       else if (i == tok_len)
 569         return i >= 3;
 570       else if (i == kw_len)
 571         return 0;
 572       else if (toupper ((unsigned char) kw[i])
 573                != toupper ((unsigned char) tok[i]))
 574         return 0;
 575
 576       i++;
 577     }
 578 }
 579
 580 /* Same as lex_id_match_len() minus the need to pass in the lengths. */
 581 int
 582 lex_id_match (const char *kw, const char *tok)
 583 {
 584   return lex_id_match_len (kw, strlen (kw), tok, strlen (tok));
 585 }
 586 \f
 587 /* Weird token functions. */
 588
 589 /* Returns the first character of the next token, except that if the
 590    next token is not an identifier, the character returned will not be
 591    a character that can begin an identifier.  Specifically, the
 592    hexstring lead-in X' causes lookahead() to return '.  Note that an
 593    alphanumeric return value doesn't guarantee an ID token, it could
 594    also be a reserved-word token. */
 595 int
 596 lex_look_ahead (void)
 597 {
 598   if (put)
 599     return put;
 600
 601   for (;;)
 602     {
 603       if (eof)
 604         unexpected_eof ();
 605
 606       for (;;)
 607         {
 608           while (isspace ((unsigned char) *prog))
 609             prog++;
 610           if (*prog)
 611             break;
 612
 613           if (dot)
 614             return '.';
 615           else if (!lex_get_line ())
 616             unexpected_eof ();
 617
 618           if (put)
 619             return put;
 620         }
 621
 622       if ((toupper ((unsigned char) *prog) == 'X'
 623            || toupper ((unsigned char) *prog) == 'B')
 624           && (prog[1] == '\'' || prog[1] == '"'))
 625         return '\'';
 626
 627       return *prog;
 628     }
 629 }
 630
 631 /* Makes the current token become the next token to be read; the
 632    current token is set to T. */
 633 void
 634 lex_put_back (int t)
 635 {
 636   put = token;
 637   token = t;
 638 }
 639
 640 /* Makes T the next token read. */
 641 void
 642 lex_put_forward (int t)
 643 {
 644   put = t;
 645 }
 646 \f
 647 /* Weird line processing functions. */
 648
 649 /* Discards the rest of the current input line for tokenization
 650    purposes, but returns the entire contents of the line for use by
 651    the caller. */
 652 char *
 653 lex_entire_line (void)
 654 {
 655   prog = ds_end (&getl_buf);
 656   dot = 0;
 657   return ds_value (&getl_buf);
 658 }
 659
 660 /* As lex_entire_line(), but only returns the part of the current line
 661    that hasn't already been tokenized.
 662    If HAD_DOT is non-null, stores nonzero into *HAD_DOT if the line
 663    ends with a terminal dot, or zero if it doesn't. */
 664 char *
 665 lex_rest_of_line (int *had_dot)
 666 {
 667   char *s = prog;
 668   prog = ds_end (&getl_buf);
 669
 670   if (had_dot)
 671     *had_dot = dot;
 672   dot = 0;
 673
 674   return s;
 675 }
 676
 677 /* Causes the rest of the current input line to be ignored for
 678    tokenization purposes. */
 679 void
 680 lex_discard_line (void)
 681 {
 682   msg (SW, _("The rest of this command has been discarded."));
 683
 684   ds_clear (&getl_buf);
 685   prog = ds_value (&getl_buf);
 686   dot = put = 0;
 687 }
 688
 689 /* Sets the current position in the current line to P, which must be
 690    in getl_buf. */
 691 void
 692 lex_set_prog (char *p)
 693 {
 694   prog = p;
 695 }
 696 \f
 697 /* Weird line reading functions. */
 698
 699 /* Read a line for use by the tokenizer. */
 700 int
 701 lex_get_line (void)
 702 {
 703   if (!getl_read_line ())
 704     return 0;
 705
 706   lex_preprocess_line ();
 707   return 1;
 708 }
 709
 710 /* Preprocesses getl_buf by removing comments, stripping trailing
 711    whitespace and the terminal dot, and removing leading indentors. */
 712 void
 713 lex_preprocess_line (void)
 714 {
 715   /* Strips comments. */
 716   {
 717     /* getl_buf iterator. */
 718     char *cp;
 719
 720     /* Nonzero inside a comment. */
 721     int comment;
 722
 723     /* Nonzero inside a quoted string. */
 724     int quote;
 725
 726     /* Remove C-style comments begun by slash-star and terminated by
 727      star-slash or newline. */
 728     quote = comment = 0;
 729     for (cp = ds_value (&getl_buf); *cp; )
 730       {
 731         /* If we're not commented out, toggle quoting. */
 732         if (!comment)
 733           {
 734             if (*cp == quote)
 735               quote = 0;
 736             else if (*cp == '\'' || *cp == '"')
 737               quote = *cp;
 738           }
 739
 740         /* If we're not quoting, toggle commenting. */
 741         if (!quote)
 742           {
 743             if (cp[0] == '/' && cp[1] == '*')
 744               {
 745                 comment = 1;
 746                 *cp++ = ' ';
 747                 *cp++ = ' ';
 748                 continue;
 749               }
 750             else if (cp[0] == '*' && cp[1] == '/' && comment)
 751               {
 752                 comment = 0;
 753                 *cp++ = ' ';
 754                 *cp++ = ' ';
 755                 continue;
 756               }
 757           }
 758
 759         /* Check commenting. */
 760         if (!comment)
 761           cp++;
 762         else
 763           *cp++ = ' ';
 764       }
 765   }
 766
 767   /* Strip trailing whitespace and terminal dot. */
 768   {
 769     size_t len = ds_length (&getl_buf);
 770     char *s = ds_value (&getl_buf);
 771
 772     /* Strip trailing whitespace. */
 773     while (len > 0 && isspace ((unsigned char) s[len - 1]))
 774       len--;
 775
 776     /* Check for and remove terminal dot. */
 777     if (len > 0 && s[len - 1] == set_endcmd)
 778       {
 779         dot = 1;
 780         len--;
 781       }
 782     else if (len == 0 && set_nullline)
 783       dot = 1;
 784     else
 785       dot = 0;
 786
 787     /* Set length. */
 788     ds_truncate (&getl_buf, len);
 789   }
 790
 791   /* In batch mode, strip leading indentors and insert a terminal dot
 792      as necessary. */
 793   if (getl_interactive != 2 && getl_mode == GETL_MODE_BATCH)
 794     {
 795       char *s = ds_value (&getl_buf);
 796
 797       if (s[0] == '+' || s[0] == '-' || s[0] == '.')
 798         s[0] = ' ';
 799       else if (s[0] && !isspace ((unsigned char) s[0]))
 800         lex_put_forward ('.');
 801     }
 802
 803   prog = ds_value (&getl_buf);
 804 }
 805 \f
 806 /* Token names. */
 807
 808 /* Returns the name of a token in a static buffer. */
 809 const char *
 810 lex_token_name (int token)
 811 {
 812   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 813     return keywords[token - T_FIRST_KEYWORD];
 814
 815   if (token < 256)
 816     {
 817       static char t[2];
 818       t[0] = token;
 819       return t;
 820     }
 821
 822   return _("<ERROR>");
 823 }
 824
 825 /* Returns an ASCII representation of the current token as a
 826    malloc()'d string. */
 827 char *
 828 lex_token_representation (void)
 829 {
 830   char *token_rep;
 831
 832   switch (token)
 833     {
 834     case T_ID:
 835     case T_NUM:
 836       return xstrdup (ds_value (&tokstr));
 837       break;
 838
 839     case T_STRING:
 840       {
 841         int hexstring = 0;
 842         char *sp, *dp;
 843
 844         for (sp = ds_value (&tokstr); sp < ds_end (&tokstr); sp++)
 845           if (!isprint ((unsigned char) *sp))
 846             {
 847               hexstring = 1;
 848               break;
 849             }
 850
 851         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 852
 853         dp = token_rep;
 854         if (hexstring)
 855           *dp++ = 'X';
 856         *dp++ = '\'';
 857
 858         if (!hexstring)
 859           for (sp = ds_value (&tokstr); *sp; )
 860             {
 861               if (*sp == '\'')
 862                 *dp++ = '\'';
 863               *dp++ = (unsigned char) *sp++;
 864             }
 865         else
 866           for (sp = ds_value (&tokstr); sp < ds_end (&tokstr); sp++)
 867             {
 868               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 869               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 870             }
 871         *dp++ = '\'';
 872         *dp = '\0';
 873
 874         return token_rep;
 875       }
 876     break;
 877
 878     case T_STOP:
 879       token_rep = xmalloc (1);
 880       *token_rep = '\0';
 881       return token_rep;
 882
 883     case T_EXP:
 884       return xstrdup ("**");
 885
 886     default:
 887       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 888         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 889       else
 890         {
 891           token_rep = xmalloc (2);
 892           token_rep[0] = token;
 893           token_rep[1] = '\0';
 894           return token_rep;
 895         }
 896     }
 897
 898   assert (0);
 899 }
 900 \f
 901 /* Really weird functions. */
 902
 903 /* Most of the time, a `-' is a lead-in to a negative number.  But
 904    sometimes it's actually part of the syntax.  If a dash can be part
 905    of syntax then this function is called to rip it off of a
 906    number. */
 907 void
 908 lex_negative_to_dash (void)
 909 {
 910   if (token == T_NUM && tokval < 0.0)
 911     {
 912       token = '-';
 913       tokval = -tokval;
 914       ds_replace (&tokstr, ds_value (&tokstr) + 1);
 915       lex_put_forward (T_NUM);
 916     }
 917 }
 918
 919 /* We're not at eof any more. */
 920 void
 921 lex_reset_eof (void)
 922 {
 923   eof = 0;
 924 }
 925
 926 /* Skip a COMMENT command. */
 927 void
 928 lex_skip_comment (void)
 929 {
 930   for (;;)
 931     {
 932       lex_get_line ();
 933       if (put == '.')
 934         break;
 935
 936       prog = ds_end (&getl_buf);
 937       if (dot)
 938         break;
 939     }
 940 }
 941 \f
 942 /* Private functions. */
 943
 944 /* Unexpected end of file. */
 945 static void
 946 unexpected_eof (void)
 947 {
 948   msg (FE, _("Unexpected end of file."));
 949 }
 950
 951 /* Returns the proper token type, either T_ID or a reserved keyword
 952    enum, for ID[], which must contain LEN characters. */
 953 static inline int
 954 check_id (const char *id, size_t len)
 955 {
 956   const char **kwp;
 957
 958   if (len < 2 || len > 4)
 959     return T_ID;
 960
 961   for (kwp = keywords; *kwp; kwp++)
 962     if (!strcmp (*kwp, id))
 963       return T_FIRST_KEYWORD + (kwp - keywords);
 964
 965   return T_ID;
 966 }
 967
 968 /* When invoked, tokstr contains a string of binary, octal, or hex
 969    digits, for values of TYPE of 0, 1, or 2, respectively.  The string
 970    is converted to characters having the specified values. */
 971 static void
 972 convert_numeric_string_to_char_string (int type)
 973 {
 974   static const char *base_names[] = {N_("binary"), N_("octal"), N_("hex")};
 975   static const int bases[] = {2, 8, 16};
 976   static const int chars_per_byte[] = {8, 3, 2};
 977
 978   const char *const base_name = base_names[type];
 979   const int base = bases[type];
 980   const int cpb = chars_per_byte[type];
 981   const int nb = ds_length (&tokstr) / cpb;
 982   int i;
 983   char *p;
 984
 985   assert (type >= 0 && type <= 2);
 986
 987   if (ds_length (&tokstr) % cpb)
 988     msg (SE, _("String of %s digits has %d characters, which is not a "
 989                "multiple of %d."),
 990          gettext (base_name), ds_length (&tokstr), cpb);
 991
 992   p = ds_value (&tokstr);
 993   for (i = 0; i < nb; i++)
 994     {
 995       int value;
 996       int j;
 997
 998       value = 0;
 999       for (j = 0; j < cpb; j++, p++)
1000         {
1001           int v;
1002
1003           if (*p >= '0' && *p <= '9')
1004             v = *p - '0';
1005           else
1006             {
1007               static const char alpha[] = "abcdef";
1008               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1009
1010               if (q)
1011                 v = q - alpha + 10;
1012               else
1013                 v = base;
1014             }
1015
1016           if (v >= base)
1017             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1018
1019           value = value * base + v;
1020         }
1021
1022       ds_value (&tokstr)[i] = (unsigned char) value;
1023     }
1024
1025   ds_truncate (&tokstr, nb);
1026 }
1027
1028 /* Parses a string from the input buffer into tokstr.  The input
1029    buffer pointer prog must point to the initial single or double
1030    quote.  TYPE is 0 if it is an ordinary string, or 1, 2, or 3 for a
1031    binary, octal, or hexstring, respectively.  Returns token type. */
1032 static int
1033 parse_string (int type)
1034 {
1035   /* Accumulate the entire string, joining sections indicated by +
1036      signs. */
1037   for (;;)
1038     {
1039       /* Single or double quote. */
1040       int c = *prog++;
1041
1042       /* Accumulate section. */
1043       for (;;)
1044         {
1045           /* Check end of line. */
1046           if (*prog == 0)
1047             {
1048               msg (SE, _("Unterminated string constant."));
1049               goto finish;
1050             }
1051
1052           /* Double quote characters to embed them in strings. */
1053           if (*prog == c)
1054             {
1055               if (prog[1] == c)
1056                 prog++;
1057               else
1058                 break;
1059             }
1060
1061           ds_putchar (&tokstr, *prog++);
1062         }
1063       prog++;
1064
1065       /* Skip whitespace after final quote mark. */
1066       if (eof)
1067         break;
1068       for (;;)
1069         {
1070           while (isspace ((unsigned char) *prog))
1071             prog++;
1072           if (*prog)
1073             break;
1074
1075           if (dot)
1076             goto finish;
1077
1078           if (!lex_get_line ())
1079             unexpected_eof ();
1080         }
1081
1082       /* Skip plus sign. */
1083       if (*prog != '+')
1084         break;
1085       prog++;
1086
1087       /* Skip whitespace after plus sign. */
1088       if (eof)
1089         break;
1090       for (;;)
1091         {
1092           while (isspace ((unsigned char) *prog))
1093             prog++;
1094           if (*prog)
1095             break;
1096
1097           if (dot)
1098             goto finish;
1099
1100           if (!lex_get_line ())
1101             unexpected_eof ();
1102         }
1103
1104       /* Ensure that a valid string follows. */
1105       if (*prog != '\'' && *prog != '"')
1106         {
1107           msg (SE, "String expected following `+'.");
1108           goto finish;
1109         }
1110     }
1111
1112   /* We come here when we've finished concatenating all the string sections
1113      into one large string. */
1114 finish:
1115   if (type != 0)
1116     convert_numeric_string_to_char_string (type - 1);
1117
1118   if (ds_length (&tokstr) > 255)
1119     {
1120       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1121            ds_length (&tokstr));
1122       ds_truncate (&tokstr, 255);
1123     }
1124
1125   {
1126     /* FIXME. */
1127     size_t i;
1128     int warned = 0;
1129
1130     for (i = 0; i < ds_length (&tokstr); i++)
1131       if (ds_value (&tokstr)[i] == 0)
1132         {
1133           if (!warned)
1134             {
1135               msg (SE, _("Sorry, literal strings may not contain null "
1136                          "characters.  Replacing with spaces."));
1137               warned = 1;
1138             }
1139           ds_value (&tokstr)[i] = ' ';
1140         }
1141   }
1142
1143   return T_STRING;
1144 }
1145 \f
1146 #if DUMP_TOKENS
1147 /* Reads one token from the lexer and writes a textual representation
1148    on stdout for debugging purposes. */
1149 static void
1150 dump_token (void)
1151 {
1152   {
1153     const char *curfn;
1154     int curln;
1155
1156     getl_location (&curfn, &curln);
1157     if (curfn)
1158       printf ("%s:%d\t", curfn, curln);
1159   }
1160
1161   switch (token)
1162     {
1163     case T_ID:
1164       printf ("ID\t%s\n", tokid);
1165       break;
1166
1167     case T_NUM:
1168       printf ("NUM\t%f\n", tokval);
1169       break;
1170
1171     case T_STRING:
1172       printf ("STRING\t\"%s\"\n", ds_value (&tokstr));
1173       break;
1174
1175     case T_STOP:
1176       printf ("STOP\n");
1177       break;
1178
1179     case T_EXP:
1180       puts ("MISC\tEXP");
1181       break;
1182
1183     case 0:
1184       puts ("MISC\tEOF");
1185       break;
1186
1187     default:
1188       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1189         printf ("KEYWORD\t%s\n", lex_token_name (token));
1190       else
1191         printf ("PUNCT\t%c\n", token);
1192       break;
1193     }
1194 }
1195 #endif /* DEBUGGING */