src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <language/line-buffer.h>
  34 #include <libpspp/magic.h>
  35 #include <data/settings.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44 /*
  45 #define DUMP_TOKENS 1
  46 */
  47
  48 \f
  49 /* Current token. */
  50 int token;
  51
  52 /* T_POS_NUM, T_NEG_NUM: the token's value. */
  53 double tokval;
  54
  55 /* T_ID: the identifier. */
  56 char tokid[LONG_NAME_LEN + 1];
  57
  58 /* T_ID, T_STRING: token string value.
  59    For T_ID, this is not truncated as is tokid. */
  60 struct string tokstr;
  61 \f
  62 /* Static variables. */
  63
  64 /* Pointer to next token in getl_buf. */
  65 static char *prog;
  66
  67 /* True only if this line ends with a terminal dot. */
  68 static bool dot;
  69
  70 /* True only if the last token returned was T_STOP. */
  71 static bool eof;
  72
  73 /* If nonzero, next token returned by lex_get().
  74    Used only in exceptional circumstances. */
  75 static int put_token;
  76 static struct string put_tokstr;
  77 static double put_tokval;
  78
  79 static int parse_id (void);
  80
  81 /* How a string represents its contents. */
  82 enum string_type
  83   {
  84     CHARACTER_STRING,   /* Characters. */
  85     BINARY_STRING,      /* Binary digits. */
  86     OCTAL_STRING,       /* Octal digits. */
  87     HEX_STRING          /* Hexadecimal digits. */
  88   };
  89
  90 static int parse_string (enum string_type);
  91
  92 #if DUMP_TOKENS
  93 static void dump_token (void);
  94 #endif
  95 \f
  96 /* Initialization. */
  97
  98 /* Initializes the lexer. */
  99 void
 100 lex_init (void)
 101 {
 102   ds_init_empty (&tokstr);
 103   ds_init_empty (&put_tokstr);
 104   if (!lex_get_line ())
 105     eof = true;
 106 }
 107
 108 void
 109 lex_done (void)
 110 {
 111   ds_destroy (&put_tokstr);
 112   ds_destroy (&tokstr);
 113 }
 114
 115 \f
 116 /* Common functions. */
 117
 118 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
 119    tokval, respectively, and sets tokid appropriately. */
 120 static void
 121 restore_token (void)
 122 {
 123   assert (put_token != 0);
 124   token = put_token;
 125   ds_assign_string (&tokstr, &put_tokstr);
 126   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 127   tokval = put_tokval;
 128   put_token = 0;
 129 }
 130
 131 /* Copies token, tokstr, tokval into put_token, put_tokstr,
 132    put_tokval respectively. */
 133 static void
 134 save_token (void)
 135 {
 136   put_token = token;
 137   ds_assign_string (&put_tokstr, &tokstr);
 138   put_tokval = tokval;
 139 }
 140
 141 /* Parses a single token, setting appropriate global variables to
 142    indicate the token's attributes. */
 143 void
 144 lex_get (void)
 145 {
 146   /* If a token was pushed ahead, return it. */
 147   if (put_token)
 148     {
 149       restore_token ();
 150 #if DUMP_TOKENS
 151       dump_token ();
 152 #endif
 153       return;
 154     }
 155
 156   /* Find a token. */
 157   for (;;)
 158     {
 159       /* Skip whitespace. */
 160       if (eof)
 161         {
 162           token = T_STOP;
 163           return;
 164         }
 165
 166       for (;;)
 167         {
 168           while (isspace ((unsigned char) *prog))
 169             prog++;
 170           if (*prog)
 171             break;
 172
 173           if (dot)
 174             {
 175               dot = 0;
 176               token = '.';
 177 #if DUMP_TOKENS
 178               dump_token ();
 179 #endif
 180               return;
 181             }
 182           else if (!lex_get_line ())
 183             {
 184               eof = true;
 185               token = T_STOP;
 186 #if DUMP_TOKENS
 187               dump_token ();
 188 #endif
 189               return;
 190             }
 191
 192           if (put_token)
 193             {
 194               restore_token ();
 195 #if DUMP_TOKENS
 196               dump_token ();
 197 #endif
 198               return;
 199             }
 200         }
 201
 202
 203       /* Actually parse the token. */
 204       ds_clear (&tokstr);
 205
 206       switch (*prog)
 207         {
 208         case '-': case '.':
 209         case '0': case '1': case '2': case '3': case '4':
 210         case '5': case '6': case '7': case '8': case '9':
 211           {
 212             char *tail;
 213
 214             /* `-' can introduce a negative number, or it can be a
 215                token by itself.  If it is not followed by a digit or a
 216                decimal point, it is definitely not a number.
 217                Otherwise, it might be either, but most of the time we
 218                want it as a number.  When the syntax calls for a `-'
 219                token, lex_negative_to_dash() must be used to break
 220                negative numbers into two tokens. */
 221             if (*prog == '-')
 222               {
 223                 ds_put_char (&tokstr, *prog++);
 224                 while (isspace ((unsigned char) *prog))
 225                   prog++;
 226
 227                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 228                   {
 229                     token = '-';
 230                     break;
 231                   }
 232                 token = T_NEG_NUM;
 233               }
 234             else
 235               token = T_POS_NUM;
 236
 237             /* Parse the number, copying it into tokstr. */
 238             while (isdigit ((unsigned char) *prog))
 239               ds_put_char (&tokstr, *prog++);
 240             if (*prog == '.')
 241               {
 242                 ds_put_char (&tokstr, *prog++);
 243                 while (isdigit ((unsigned char) *prog))
 244                   ds_put_char (&tokstr, *prog++);
 245               }
 246             if (*prog == 'e' || *prog == 'E')
 247               {
 248                 ds_put_char (&tokstr, *prog++);
 249                 if (*prog == '+' || *prog == '-')
 250                   ds_put_char (&tokstr, *prog++);
 251                 while (isdigit ((unsigned char) *prog))
 252                   ds_put_char (&tokstr, *prog++);
 253               }
 254
 255             /* Parse as floating point. */
 256             tokval = strtod (ds_cstr (&tokstr), &tail);
 257             if (*tail)
 258               {
 259                 msg (SE, _("%s does not form a valid number."),
 260                      ds_cstr (&tokstr));
 261                 tokval = 0.0;
 262
 263                 ds_clear (&tokstr);
 264                 ds_put_char (&tokstr, '0');
 265               }
 266
 267             break;
 268           }
 269
 270         case '\'': case '"':
 271           token = parse_string (CHARACTER_STRING);
 272           break;
 273
 274         case '(': case ')': case ',': case '=': case '+': case '/':
 275           token = *prog++;
 276           break;
 277
 278         case '*':
 279           if (*++prog == '*')
 280             {
 281               prog++;
 282               token = T_EXP;
 283             }
 284           else
 285             token = '*';
 286           break;
 287
 288         case '<':
 289           if (*++prog == '=')
 290             {
 291               prog++;
 292               token = T_LE;
 293             }
 294           else if (*prog == '>')
 295             {
 296               prog++;
 297               token = T_NE;
 298             }
 299           else
 300             token = T_LT;
 301           break;
 302
 303         case '>':
 304           if (*++prog == '=')
 305             {
 306               prog++;
 307               token = T_GE;
 308             }
 309           else
 310             token = T_GT;
 311           break;
 312
 313         case '~':
 314           if (*++prog == '=')
 315             {
 316               prog++;
 317               token = T_NE;
 318             }
 319           else
 320             token = T_NOT;
 321           break;
 322
 323         case '&':
 324           prog++;
 325           token = T_AND;
 326           break;
 327
 328         case '|':
 329           prog++;
 330           token = T_OR;
 331           break;
 332
 333         case 'b': case 'B':
 334           if (prog[1] == '\'' || prog[1] == '"')
 335             token = parse_string (BINARY_STRING);
 336           else
 337             token = parse_id ();
 338           break;
 339
 340         case 'o': case 'O':
 341           if (prog[1] == '\'' || prog[1] == '"')
 342             token = parse_string (OCTAL_STRING);
 343           else
 344             token = parse_id ();
 345           break;
 346
 347         case 'x': case 'X':
 348           if (prog[1] == '\'' || prog[1] == '"')
 349             token = parse_string (HEX_STRING);
 350           else
 351             token = parse_id ();
 352           break;
 353
 354         default:
 355           if (lex_is_id1 (*prog))
 356             {
 357               token = parse_id ();
 358               break;
 359             }
 360           else
 361             {
 362               if (isgraph ((unsigned char) *prog))
 363                 msg (SE, _("Bad character in input: `%c'."), *prog++);
 364               else
 365                 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 366               continue;
 367             }
 368         }
 369       break;
 370     }
 371
 372 #if DUMP_TOKENS
 373   dump_token ();
 374 #endif
 375 }
 376
 377 /* Parses an identifier at the current position into tokid and
 378    tokstr.
 379    Returns the correct token type. */
 380 static int
 381 parse_id (void)
 382 {
 383   const char *start = prog;
 384   prog = lex_skip_identifier (start);
 385
 386   ds_put_substring (&tokstr, ss_buffer (start, prog - start));
 387   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 388   return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
 389 }
 390
 391 /* Reports an error to the effect that subcommand SBC may only be
 392    specified once. */
 393 void
 394 lex_sbc_only_once (const char *sbc)
 395 {
 396   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 397 }
 398
 399 /* Reports an error to the effect that subcommand SBC is
 400    missing. */
 401 void
 402 lex_sbc_missing (const char *sbc)
 403 {
 404   lex_error (_("missing required subcommand %s"), sbc);
 405 }
 406
 407 /* Prints a syntax error message containing the current token and
 408    given message MESSAGE (if non-null). */
 409 void
 410 lex_error (const char *message, ...)
 411 {
 412   char *token_rep;
 413   char where[128];
 414
 415   token_rep = lex_token_representation ();
 416   if (token == T_STOP)
 417     strcpy (where, "end of file");
 418   else if (token == '.')
 419     strcpy (where, "end of command");
 420   else
 421     snprintf (where, sizeof where, "`%s'", token_rep);
 422   free (token_rep);
 423
 424   if (message)
 425     {
 426       char buf[1024];
 427       va_list args;
 428
 429       va_start (args, message);
 430       vsnprintf (buf, 1024, message, args);
 431       va_end (args);
 432
 433       msg (SE, _("Syntax error %s at %s."), buf, where);
 434     }
 435   else
 436     msg (SE, _("Syntax error at %s."), where);
 437 }
 438
 439 /* Checks that we're at end of command.
 440    If so, returns a successful command completion code.
 441    If not, flags a syntax error and returns an error command
 442    completion code. */
 443 int
 444 lex_end_of_command (void)
 445 {
 446   if (token != '.')
 447     {
 448       lex_error (_("expecting end of command"));
 449       return CMD_FAILURE;
 450     }
 451   else
 452     return CMD_SUCCESS;
 453 }
 454 \f
 455 /* Token testing functions. */
 456
 457 /* Returns true if the current token is a number. */
 458 bool
 459 lex_is_number (void)
 460 {
 461   return token == T_POS_NUM || token == T_NEG_NUM;
 462 }
 463
 464 /* Returns the value of the current token, which must be a
 465    floating point number. */
 466 double
 467 lex_number (void)
 468 {
 469   assert (lex_is_number ());
 470   return tokval;
 471 }
 472
 473 /* Returns true iff the current token is an integer. */
 474 bool
 475 lex_is_integer (void)
 476 {
 477   return (lex_is_number ()
 478           && tokval != NOT_LONG
 479           && tokval >= LONG_MIN
 480           && tokval <= LONG_MAX
 481           && floor (tokval) == tokval);
 482 }
 483
 484 /* Returns the value of the current token, which must be an
 485    integer. */
 486 long
 487 lex_integer (void)
 488 {
 489   assert (lex_is_integer ());
 490   return tokval;
 491 }
 492 \f
 493 /* Token matching functions. */
 494
 495 /* If TOK is the current token, skips it and returns true
 496    Otherwise, returns false. */
 497 bool
 498 lex_match (int t)
 499 {
 500   if (token == t)
 501     {
 502       lex_get ();
 503       return true;
 504     }
 505   else
 506     return false;
 507 }
 508
 509 /* If the current token is the identifier S, skips it and returns
 510    true.  The identifier may be abbreviated to its first three
 511    letters.
 512    Otherwise, returns false. */
 513 bool
 514 lex_match_id (const char *s)
 515 {
 516   if (token == T_ID && lex_id_match (s, tokid))
 517     {
 518       lex_get ();
 519       return true;
 520     }
 521   else
 522     return false;
 523 }
 524
 525 /* If the current token is integer N, skips it and returns true.
 526    Otherwise, returns false. */
 527 bool
 528 lex_match_int (int x)
 529 {
 530   if (lex_is_integer () && lex_integer () == x)
 531     {
 532       lex_get ();
 533       return true;
 534     }
 535   else
 536     return false;
 537 }
 538 \f
 539 /* Forced matches. */
 540
 541 /* If this token is identifier S, fetches the next token and returns
 542    nonzero.
 543    Otherwise, reports an error and returns zero. */
 544 bool
 545 lex_force_match_id (const char *s)
 546 {
 547   if (token == T_ID && lex_id_match (s, tokid))
 548     {
 549       lex_get ();
 550       return true;
 551     }
 552   else
 553     {
 554       lex_error (_("expecting `%s'"), s);
 555       return false;
 556     }
 557 }
 558
 559 /* If the current token is T, skips the token.  Otherwise, reports an
 560    error and returns from the current function with return value false. */
 561 bool
 562 lex_force_match (int t)
 563 {
 564   if (token == t)
 565     {
 566       lex_get ();
 567       return true;
 568     }
 569   else
 570     {
 571       lex_error (_("expecting `%s'"), lex_token_name (t));
 572       return false;
 573     }
 574 }
 575
 576 /* If this token is a string, does nothing and returns true.
 577    Otherwise, reports an error and returns false. */
 578 bool
 579 lex_force_string (void)
 580 {
 581   if (token == T_STRING)
 582     return true;
 583   else
 584     {
 585       lex_error (_("expecting string"));
 586       return false;
 587     }
 588 }
 589
 590 /* If this token is an integer, does nothing and returns true.
 591    Otherwise, reports an error and returns false. */
 592 bool
 593 lex_force_int (void)
 594 {
 595   if (lex_is_integer ())
 596     return true;
 597   else
 598     {
 599       lex_error (_("expecting integer"));
 600       return false;
 601     }
 602 }
 603
 604 /* If this token is a number, does nothing and returns true.
 605    Otherwise, reports an error and returns false. */
 606 bool
 607 lex_force_num (void)
 608 {
 609   if (lex_is_number ())
 610     return true;
 611   else
 612     {
 613       lex_error (_("expecting number"));
 614       return false;
 615     }
 616 }
 617
 618 /* If this token is an identifier, does nothing and returns true.
 619    Otherwise, reports an error and returns false. */
 620 bool
 621 lex_force_id (void)
 622 {
 623   if (token == T_ID)
 624     return true;
 625   else
 626     {
 627       lex_error (_("expecting identifier"));
 628       return false;
 629     }
 630 }
 631 /* Weird token functions. */
 632
 633 /* Returns the first character of the next token, except that if the
 634    next token is not an identifier, the character returned will not be
 635    a character that can begin an identifier.  Specifically, the
 636    hexstring lead-in X' causes lookahead() to return '.  Note that an
 637    alphanumeric return value doesn't guarantee an ID token, it could
 638    also be a reserved-word token. */
 639 int
 640 lex_look_ahead (void)
 641 {
 642   if (put_token)
 643     return put_token;
 644
 645   for (;;)
 646     {
 647       if (eof)
 648         return 0;
 649
 650       for (;;)
 651         {
 652           while (isspace ((unsigned char) *prog))
 653             prog++;
 654           if (*prog)
 655             break;
 656
 657           if (dot)
 658             return '.';
 659           else if (!lex_get_line ())
 660             return 0;
 661
 662           if (put_token)
 663             return put_token;
 664         }
 665
 666       if ((toupper ((unsigned char) *prog) == 'X'
 667            || toupper ((unsigned char) *prog) == 'B'
 668            || toupper ((unsigned char) *prog) == 'O')
 669           && (prog[1] == '\'' || prog[1] == '"'))
 670         return '\'';
 671
 672       return *prog;
 673     }
 674 }
 675
 676 /* Makes the current token become the next token to be read; the
 677    current token is set to T. */
 678 void
 679 lex_put_back (int t)
 680 {
 681   save_token ();
 682   token = t;
 683 }
 684
 685 /* Makes the current token become the next token to be read; the
 686    current token is set to the identifier ID. */
 687 void
 688 lex_put_back_id (const char *id)
 689 {
 690   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 691   save_token ();
 692   token = T_ID;
 693   ds_assign_cstr (&tokstr, id);
 694   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 695 }
 696 \f
 697 /* Weird line processing functions. */
 698
 699 /* Returns the entire contents of the current line. */
 700 const char *
 701 lex_entire_line (void)
 702 {
 703   return ds_cstr (&getl_buf);
 704 }
 705
 706 /* As lex_entire_line(), but only returns the part of the current line
 707    that hasn't already been tokenized.
 708    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 709    ends with a terminal dot, or zero if it doesn't. */
 710 const char *
 711 lex_rest_of_line (int *end_dot)
 712 {
 713   if (end_dot)
 714     *end_dot = dot;
 715   return prog;
 716 }
 717
 718 /* Causes the rest of the current input line to be ignored for
 719    tokenization purposes. */
 720 void
 721 lex_discard_line (void)
 722 {
 723   prog = ds_end (&getl_buf);
 724   dot = false;
 725   put_token = 0;
 726 }
 727
 728 /* Sets the current position in the current line to P, which must be
 729    in getl_buf. */
 730 void
 731 lex_set_prog (char *p)
 732 {
 733   prog = p;
 734 }
 735
 736 /* Discards the rest of the current command.
 737    When we're reading commands from a file, we skip tokens until
 738    a terminal dot or EOF.
 739    When we're reading commands interactively from the user,
 740    that's just discarding the current line, because presumably
 741    the user doesn't want to finish typing a command that will be
 742    ignored anyway. */
 743 void
 744 lex_discard_rest_of_command (void)
 745 {
 746   if (!getl_is_interactive ())
 747     {
 748       while (token != T_STOP && token != '.')
 749         lex_get ();
 750     }
 751   else
 752     lex_discard_line ();
 753 }
 754 \f
 755 /* Weird line reading functions. */
 756
 757 /* Remove C-style comments in STRING, begun by slash-star and
 758    terminated by star-slash or newline. */
 759 static void
 760 strip_comments (struct string *string)
 761 {
 762   char *cp;
 763   int quote;
 764   bool in_comment;
 765
 766   in_comment = false;
 767   quote = EOF;
 768   for (cp = ds_cstr (string); *cp; )
 769     {
 770       /* If we're not in a comment, check for quote marks. */
 771       if (!in_comment)
 772         {
 773           if (*cp == quote)
 774             quote = EOF;
 775           else if (*cp == '\'' || *cp == '"')
 776             quote = *cp;
 777         }
 778
 779       /* If we're not inside a quotation, check for comment. */
 780       if (quote == EOF)
 781         {
 782           if (cp[0] == '/' && cp[1] == '*')
 783             {
 784               in_comment = true;
 785               *cp++ = ' ';
 786               *cp++ = ' ';
 787               continue;
 788             }
 789           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 790             {
 791               in_comment = false;
 792               *cp++ = ' ';
 793               *cp++ = ' ';
 794               continue;
 795             }
 796         }
 797
 798       /* Check commenting. */
 799       if (in_comment)
 800         *cp = ' ';
 801       cp++;
 802     }
 803 }
 804
 805 /* Reads a line for use by the tokenizer, and preprocesses it by
 806    removing comments, stripping trailing whitespace and the
 807    terminal dot, and removing leading indentors. */
 808 bool
 809 lex_get_line (void)
 810 {
 811   struct string *line = &getl_buf;
 812   bool interactive;
 813
 814   if (!getl_read_line (&interactive))
 815     return false;
 816
 817   strip_comments (line);
 818   ds_rtrim (line, ss_cstr (CC_SPACES));
 819
 820   /* Check for and remove terminal dot. */
 821   dot = (ds_chomp (line, get_endcmd ())
 822          || (ds_is_empty (line) && get_nulline ()));
 823
 824   /* Strip leading indentors or insert a terminal dot (unless the
 825      line was obtained interactively). */
 826   if (!interactive)
 827     {
 828       int first = ds_first (line);
 829
 830       if (first == '+' || first == '-')
 831         *ds_data (line) = ' ';
 832       else if (first != EOF && !isspace (first))
 833         put_token = '.';
 834     }
 835
 836   prog = ds_cstr (line);
 837
 838   return true;
 839 }
 840 \f
 841 /* Token names. */
 842
 843 /* Returns the name of a token in a static buffer. */
 844 const char *
 845 lex_token_name (int token)
 846 {
 847   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 848     return keywords[token - T_FIRST_KEYWORD];
 849
 850   if (token < 256)
 851     {
 852       static char t[2];
 853       t[0] = token;
 854       return t;
 855     }
 856
 857   NOT_REACHED ();
 858 }
 859
 860 /* Returns an ASCII representation of the current token as a
 861    malloc()'d string. */
 862 char *
 863 lex_token_representation (void)
 864 {
 865   char *token_rep;
 866
 867   switch (token)
 868     {
 869     case T_ID:
 870     case T_POS_NUM:
 871     case T_NEG_NUM:
 872       return ds_xstrdup (&tokstr);
 873       break;
 874
 875     case T_STRING:
 876       {
 877         int hexstring = 0;
 878         char *sp, *dp;
 879
 880         for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 881           if (!isprint ((unsigned char) *sp))
 882             {
 883               hexstring = 1;
 884               break;
 885             }
 886
 887         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 888
 889         dp = token_rep;
 890         if (hexstring)
 891           *dp++ = 'X';
 892         *dp++ = '\'';
 893
 894         if (!hexstring)
 895           for (sp = ds_cstr (&tokstr); *sp; )
 896             {
 897               if (*sp == '\'')
 898                 *dp++ = '\'';
 899               *dp++ = (unsigned char) *sp++;
 900             }
 901         else
 902           for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 903             {
 904               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 905               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 906             }
 907         *dp++ = '\'';
 908         *dp = '\0';
 909
 910         return token_rep;
 911       }
 912     break;
 913
 914     case T_STOP:
 915       token_rep = xmalloc (1);
 916       *token_rep = '\0';
 917       return token_rep;
 918
 919     case T_EXP:
 920       return xstrdup ("**");
 921
 922     default:
 923       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 924         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 925       else
 926         {
 927           token_rep = xmalloc (2);
 928           token_rep[0] = token;
 929           token_rep[1] = '\0';
 930           return token_rep;
 931         }
 932     }
 933
 934   NOT_REACHED ();
 935 }
 936 \f
 937 /* Really weird functions. */
 938
 939 /* Most of the time, a `-' is a lead-in to a negative number.  But
 940    sometimes it's actually part of the syntax.  If a dash can be part
 941    of syntax then this function is called to rip it off of a
 942    number. */
 943 void
 944 lex_negative_to_dash (void)
 945 {
 946   if (token == T_NEG_NUM)
 947     {
 948       token = T_POS_NUM;
 949       tokval = -tokval;
 950       ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
 951       save_token ();
 952       token = '-';
 953     }
 954 }
 955
 956 /* We're not at eof any more. */
 957 void
 958 lex_reset_eof (void)
 959 {
 960   eof = false;
 961 }
 962
 963 /* Skip a COMMENT command. */
 964 void
 965 lex_skip_comment (void)
 966 {
 967   for (;;)
 968     {
 969       if (!lex_get_line ())
 970         {
 971           put_token = T_STOP;
 972           eof = true;
 973           return;
 974         }
 975
 976       if (put_token == '.')
 977         break;
 978
 979       prog = ds_end (&getl_buf);
 980       if (dot)
 981         break;
 982     }
 983 }
 984 \f
 985 /* Private functions. */
 986
 987 /* When invoked, tokstr contains a string of binary, octal, or
 988    hex digits, according to TYPE.  The string is converted to
 989    characters having the specified values. */
 990 static void
 991 convert_numeric_string_to_char_string (enum string_type type)
 992 {
 993   const char *base_name;
 994   int base;
 995   int chars_per_byte;
 996   size_t byte_cnt;
 997   size_t i;
 998   char *p;
 999
1000   switch (type)
1001     {
1002     case BINARY_STRING:
1003       base_name = _("binary");
1004       base = 2;
1005       chars_per_byte = 8;
1006       break;
1007     case OCTAL_STRING:
1008       base_name = _("octal");
1009       base = 8;
1010       chars_per_byte = 3;
1011       break;
1012     case HEX_STRING:
1013       base_name = _("hex");
1014       base = 16;
1015       chars_per_byte = 2;
1016       break;
1017     default:
1018       NOT_REACHED ();
1019     }
1020
1021   byte_cnt = ds_length (&tokstr) / chars_per_byte;
1022   if (ds_length (&tokstr) % chars_per_byte)
1023     msg (SE, _("String of %s digits has %d characters, which is not a "
1024                "multiple of %d."),
1025          base_name, ds_length (&tokstr), chars_per_byte);
1026
1027   p = ds_cstr (&tokstr);
1028   for (i = 0; i < byte_cnt; i++)
1029     {
1030       int value;
1031       int j;
1032
1033       value = 0;
1034       for (j = 0; j < chars_per_byte; j++, p++)
1035         {
1036           int v;
1037
1038           if (*p >= '0' && *p <= '9')
1039             v = *p - '0';
1040           else
1041             {
1042               static const char alpha[] = "abcdef";
1043               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1044
1045               if (q)
1046                 v = q - alpha + 10;
1047               else
1048                 v = base;
1049             }
1050
1051           if (v >= base)
1052             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1053
1054           value = value * base + v;
1055         }
1056
1057       ds_cstr (&tokstr)[i] = (unsigned char) value;
1058     }
1059
1060   ds_truncate (&tokstr, byte_cnt);
1061 }
1062
1063 /* Parses a string from the input buffer into tokstr.  The input
1064    buffer pointer prog must point to the initial single or double
1065    quote.  TYPE indicates the type of string to be parsed.
1066    Returns token type. */
1067 static int
1068 parse_string (enum string_type type)
1069 {
1070   /* Accumulate the entire string, joining sections indicated by +
1071      signs. */
1072   for (;;)
1073     {
1074       /* Single or double quote. */
1075       int c = *prog++;
1076
1077       /* Accumulate section. */
1078       for (;;)
1079         {
1080           /* Check end of line. */
1081           if (*prog == '\0')
1082             {
1083               msg (SE, _("Unterminated string constant."));
1084               goto finish;
1085             }
1086
1087           /* Double quote characters to embed them in strings. */
1088           if (*prog == c)
1089             {
1090               if (prog[1] == c)
1091                 prog++;
1092               else
1093                 break;
1094             }
1095
1096           ds_put_char (&tokstr, *prog++);
1097         }
1098       prog++;
1099
1100       /* Skip whitespace after final quote mark. */
1101       if (eof)
1102         break;
1103       for (;;)
1104         {
1105           while (isspace ((unsigned char) *prog))
1106             prog++;
1107           if (*prog)
1108             break;
1109
1110           if (dot)
1111             goto finish;
1112
1113           if (!lex_get_line ())
1114             goto finish;
1115         }
1116
1117       /* Skip plus sign. */
1118       if (*prog != '+')
1119         break;
1120       prog++;
1121
1122       /* Skip whitespace after plus sign. */
1123       if (eof)
1124         break;
1125       for (;;)
1126         {
1127           while (isspace ((unsigned char) *prog))
1128             prog++;
1129           if (*prog)
1130             break;
1131
1132           if (dot)
1133             goto finish;
1134
1135           if (!lex_get_line ())
1136             {
1137               msg (SE, _("Unexpected end of file in string concatenation."));
1138               goto finish;
1139             }
1140         }
1141
1142       /* Ensure that a valid string follows. */
1143       if (*prog != '\'' && *prog != '"')
1144         {
1145           msg (SE, _("String expected following `+'."));
1146           goto finish;
1147         }
1148     }
1149
1150   /* We come here when we've finished concatenating all the string sections
1151      into one large string. */
1152 finish:
1153   if (type != CHARACTER_STRING)
1154     convert_numeric_string_to_char_string (type);
1155
1156   if (ds_length (&tokstr) > 255)
1157     {
1158       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1159            ds_length (&tokstr));
1160       ds_truncate (&tokstr, 255);
1161     }
1162
1163   {
1164     /* FIXME. */
1165     size_t i;
1166     int warned = 0;
1167
1168     for (i = 0; i < ds_length (&tokstr); i++)
1169       if (ds_cstr (&tokstr)[i] == 0)
1170         {
1171           if (!warned)
1172             {
1173               msg (SE, _("Sorry, literal strings may not contain null "
1174                          "characters.  Replacing with spaces."));
1175               warned = 1;
1176             }
1177           ds_cstr (&tokstr)[i] = ' ';
1178         }
1179   }
1180
1181   return T_STRING;
1182 }
1183 \f
1184 #if DUMP_TOKENS
1185 /* Reads one token from the lexer and writes a textual representation
1186    on stdout for debugging purposes. */
1187 static void
1188 dump_token (void)
1189 {
1190   {
1191     const char *curfn;
1192     int curln;
1193
1194     getl_location (&curfn, &curln);
1195     if (curfn)
1196       fprintf (stderr, "%s:%d\t", curfn, curln);
1197   }
1198
1199   switch (token)
1200     {
1201     case T_ID:
1202       fprintf (stderr, "ID\t%s\n", tokid);
1203       break;
1204
1205     case T_POS_NUM:
1206     case T_NEG_NUM:
1207       fprintf (stderr, "NUM\t%f\n", tokval);
1208       break;
1209
1210     case T_STRING:
1211       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1212       break;
1213
1214     case T_STOP:
1215       fprintf (stderr, "STOP\n");
1216       break;
1217
1218     case T_EXP:
1219       fprintf (stderr, "MISC\tEXP\"");
1220       break;
1221
1222     case 0:
1223       fprintf (stderr, "MISC\tEOF\n");
1224       break;
1225
1226     default:
1227       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1228         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1229       else
1230         fprintf (stderr, "PUNCT\t%c\n", token);
1231       break;
1232     }
1233 }
1234 #endif /* DUMP_TOKENS */