src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <language/command.h>
  31 #include <libpspp/message.h>
  32 #include <language/line-buffer.h>
  33 #include <libpspp/magic.h>
  34 #include <data/settings.h>
  35 #include <libpspp/str.h>
  36
  37 #include "size_max.h"
  38
  39 #include "gettext.h"
  40 #define _(msgid) gettext (msgid)
  41 #define N_(msgid) msgid
  42
  43 /*
  44 #define DUMP_TOKENS 1
  45 */
  46
  47 \f
  48 /* Global variables. */
  49
  50 extern const char *keywords[T_N_KEYWORDS + 1];
  51
  52
  53 /* Current token. */
  54 int token;
  55
  56 /* T_POS_NUM, T_NEG_NUM: the token's value. */
  57 double tokval;
  58
  59 /* T_ID: the identifier. */
  60 char tokid[LONG_NAME_LEN + 1];
  61
  62 /* T_ID, T_STRING: token string value.
  63    For T_ID, this is not truncated as is tokid. */
  64 struct string tokstr;
  65 \f
  66 /* Static variables. */
  67
  68 /* Pointer to next token in getl_buf. */
  69 static char *prog;
  70
  71 /* Nonzero only if this line ends with a terminal dot. */
  72 static int dot;
  73
  74 /* Nonzero only if the last token returned was T_STOP. */
  75 static int eof;
  76
  77 /* If nonzero, next token returned by lex_get().
  78    Used only in exceptional circumstances. */
  79 static int put_token;
  80 static struct string put_tokstr;
  81 static double put_tokval;
  82
  83 static int parse_id (void);
  84
  85 /* How a string represents its contents. */
  86 enum string_type
  87   {
  88     CHARACTER_STRING,   /* Characters. */
  89     BINARY_STRING,      /* Binary digits. */
  90     OCTAL_STRING,       /* Octal digits. */
  91     HEX_STRING          /* Hexadecimal digits. */
  92   };
  93
  94 static int parse_string (enum string_type);
  95
  96 #if DUMP_TOKENS
  97 static void dump_token (void);
  98 #endif
  99 \f
 100 /* Initialization. */
 101
 102 /* Initializes the lexer. */
 103 void
 104 lex_init (void)
 105 {
 106   ds_init_empty (&tokstr);
 107   ds_init_empty (&put_tokstr);
 108   if (!lex_get_line ())
 109     eof = true;
 110 }
 111
 112 void
 113 lex_done (void)
 114 {
 115   ds_destroy (&put_tokstr);
 116   ds_destroy (&tokstr);
 117 }
 118
 119 \f
 120 /* Common functions. */
 121
 122 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
 123    tokval, respectively, and sets tokid appropriately. */
 124 static void
 125 restore_token (void)
 126 {
 127   assert (put_token != 0);
 128   token = put_token;
 129   ds_assign_string (&tokstr, &put_tokstr);
 130   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 131   tokval = put_tokval;
 132   put_token = 0;
 133 }
 134
 135 /* Copies token, tokstr, tokval into put_token, put_tokstr,
 136    put_tokval respectively. */
 137 static void
 138 save_token (void)
 139 {
 140   put_token = token;
 141   ds_assign_string (&put_tokstr, &tokstr);
 142   put_tokval = tokval;
 143 }
 144
 145 /* Parses a single token, setting appropriate global variables to
 146    indicate the token's attributes. */
 147 void
 148 lex_get (void)
 149 {
 150   /* If a token was pushed ahead, return it. */
 151   if (put_token)
 152     {
 153       restore_token ();
 154 #if DUMP_TOKENS
 155       dump_token ();
 156 #endif
 157       return;
 158     }
 159
 160   /* Find a token. */
 161   for (;;)
 162     {
 163       /* Skip whitespace. */
 164       if (eof)
 165         {
 166           token = T_STOP;
 167           return;
 168         }
 169
 170       for (;;)
 171         {
 172           while (isspace ((unsigned char) *prog))
 173             prog++;
 174           if (*prog)
 175             break;
 176
 177           if (dot)
 178             {
 179               dot = 0;
 180               token = '.';
 181 #if DUMP_TOKENS
 182               dump_token ();
 183 #endif
 184               return;
 185             }
 186           else if (!lex_get_line ())
 187             {
 188               eof = 1;
 189               token = T_STOP;
 190 #if DUMP_TOKENS
 191               dump_token ();
 192 #endif
 193               return;
 194             }
 195
 196           if (put_token)
 197             {
 198               restore_token ();
 199 #if DUMP_TOKENS
 200               dump_token ();
 201 #endif
 202               return;
 203             }
 204         }
 205
 206
 207       /* Actually parse the token. */
 208       ds_clear (&tokstr);
 209
 210       switch (*prog)
 211         {
 212         case '-': case '.':
 213         case '0': case '1': case '2': case '3': case '4':
 214         case '5': case '6': case '7': case '8': case '9':
 215           {
 216             char *tail;
 217
 218             /* `-' can introduce a negative number, or it can be a
 219                token by itself.  If it is not followed by a digit or a
 220                decimal point, it is definitely not a number.
 221                Otherwise, it might be either, but most of the time we
 222                want it as a number.  When the syntax calls for a `-'
 223                token, lex_negative_to_dash() must be used to break
 224                negative numbers into two tokens. */
 225             if (*prog == '-')
 226               {
 227                 ds_put_char (&tokstr, *prog++);
 228                 while (isspace ((unsigned char) *prog))
 229                   prog++;
 230
 231                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 232                   {
 233                     token = '-';
 234                     break;
 235                   }
 236                 token = T_NEG_NUM;
 237               }
 238             else
 239               token = T_POS_NUM;
 240
 241             /* Parse the number, copying it into tokstr. */
 242             while (isdigit ((unsigned char) *prog))
 243               ds_put_char (&tokstr, *prog++);
 244             if (*prog == '.')
 245               {
 246                 ds_put_char (&tokstr, *prog++);
 247                 while (isdigit ((unsigned char) *prog))
 248                   ds_put_char (&tokstr, *prog++);
 249               }
 250             if (*prog == 'e' || *prog == 'E')
 251               {
 252                 ds_put_char (&tokstr, *prog++);
 253                 if (*prog == '+' || *prog == '-')
 254                   ds_put_char (&tokstr, *prog++);
 255                 while (isdigit ((unsigned char) *prog))
 256                   ds_put_char (&tokstr, *prog++);
 257               }
 258
 259             /* Parse as floating point. */
 260             tokval = strtod (ds_cstr (&tokstr), &tail);
 261             if (*tail)
 262               {
 263                 msg (SE, _("%s does not form a valid number."),
 264                      ds_cstr (&tokstr));
 265                 tokval = 0.0;
 266
 267                 ds_clear (&tokstr);
 268                 ds_put_char (&tokstr, '0');
 269               }
 270
 271             break;
 272           }
 273
 274         case '\'': case '"':
 275           token = parse_string (CHARACTER_STRING);
 276           break;
 277
 278         case '(': case ')': case ',': case '=': case '+': case '/':
 279           token = *prog++;
 280           break;
 281
 282         case '*':
 283           if (*++prog == '*')
 284             {
 285               prog++;
 286               token = T_EXP;
 287             }
 288           else
 289             token = '*';
 290           break;
 291
 292         case '<':
 293           if (*++prog == '=')
 294             {
 295               prog++;
 296               token = T_LE;
 297             }
 298           else if (*prog == '>')
 299             {
 300               prog++;
 301               token = T_NE;
 302             }
 303           else
 304             token = T_LT;
 305           break;
 306
 307         case '>':
 308           if (*++prog == '=')
 309             {
 310               prog++;
 311               token = T_GE;
 312             }
 313           else
 314             token = T_GT;
 315           break;
 316
 317         case '~':
 318           if (*++prog == '=')
 319             {
 320               prog++;
 321               token = T_NE;
 322             }
 323           else
 324             token = T_NOT;
 325           break;
 326
 327         case '&':
 328           prog++;
 329           token = T_AND;
 330           break;
 331
 332         case '|':
 333           prog++;
 334           token = T_OR;
 335           break;
 336
 337         case 'b': case 'B':
 338           if (prog[1] == '\'' || prog[1] == '"')
 339             token = parse_string (BINARY_STRING);
 340           else
 341             token = parse_id ();
 342           break;
 343
 344         case 'o': case 'O':
 345           if (prog[1] == '\'' || prog[1] == '"')
 346             token = parse_string (OCTAL_STRING);
 347           else
 348             token = parse_id ();
 349           break;
 350
 351         case 'x': case 'X':
 352           if (prog[1] == '\'' || prog[1] == '"')
 353             token = parse_string (HEX_STRING);
 354           else
 355             token = parse_id ();
 356           break;
 357
 358         default:
 359           if (lex_is_id1 (*prog))
 360             {
 361               token = parse_id ();
 362               break;
 363             }
 364           else
 365             {
 366               if (isgraph ((unsigned char) *prog))
 367                 msg (SE, _("Bad character in input: `%c'."), *prog++);
 368               else
 369                 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 370               continue;
 371             }
 372         }
 373       break;
 374     }
 375
 376 #if DUMP_TOKENS
 377   dump_token ();
 378 #endif
 379 }
 380
 381 /* Parses an identifier at the current position into tokid and
 382    tokstr.
 383    Returns the correct token type. */
 384 static int
 385 parse_id (void)
 386 {
 387   const char *start = prog;
 388   prog = lex_skip_identifier (start);
 389
 390   ds_put_substring (&tokstr, ss_buffer (start, prog - start));
 391   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 392   return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
 393 }
 394
 395 /* Reports an error to the effect that subcommand SBC may only be
 396    specified once. */
 397 void
 398 lex_sbc_only_once (const char *sbc)
 399 {
 400   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 401 }
 402
 403 /* Reports an error to the effect that subcommand SBC is
 404    missing. */
 405 void
 406 lex_sbc_missing (const char *sbc)
 407 {
 408   lex_error (_("missing required subcommand %s"), sbc);
 409 }
 410
 411 /* Prints a syntax error message containing the current token and
 412    given message MESSAGE (if non-null). */
 413 void
 414 lex_error (const char *message, ...)
 415 {
 416   char *token_rep;
 417   char where[128];
 418
 419   token_rep = lex_token_representation ();
 420   if (token == T_STOP)
 421     strcpy (where, "end of file");
 422   else if (token == '.')
 423     strcpy (where, "end of command");
 424   else
 425     snprintf (where, sizeof where, "`%s'", token_rep);
 426   free (token_rep);
 427
 428   if (message)
 429     {
 430       char buf[1024];
 431       va_list args;
 432
 433       va_start (args, message);
 434       vsnprintf (buf, 1024, message, args);
 435       va_end (args);
 436
 437       msg (SE, _("Syntax error %s at %s."), buf, where);
 438     }
 439   else
 440     msg (SE, _("Syntax error at %s."), where);
 441 }
 442
 443 /* Checks that we're at end of command.
 444    If so, returns a successful command completion code.
 445    If not, flags a syntax error and returns an error command
 446    completion code. */
 447 int
 448 lex_end_of_command (void)
 449 {
 450   if (token != '.')
 451     {
 452       lex_error (_("expecting end of command"));
 453       return CMD_FAILURE;
 454     }
 455   else
 456     return CMD_SUCCESS;
 457 }
 458 \f
 459 /* Token testing functions. */
 460
 461 /* Returns true if the current token is a number. */
 462 bool
 463 lex_is_number (void)
 464 {
 465   return token == T_POS_NUM || token == T_NEG_NUM;
 466 }
 467
 468 /* Returns the value of the current token, which must be a
 469    floating point number. */
 470 double
 471 lex_number (void)
 472 {
 473   assert (lex_is_number ());
 474   return tokval;
 475 }
 476
 477 /* Returns true iff the current token is an integer. */
 478 bool
 479 lex_is_integer (void)
 480 {
 481   return (lex_is_number ()
 482           && tokval != NOT_LONG
 483           && tokval >= LONG_MIN
 484           && tokval <= LONG_MAX
 485           && floor (tokval) == tokval);
 486 }
 487
 488 /* Returns the value of the current token, which must be an
 489    integer. */
 490 long
 491 lex_integer (void)
 492 {
 493   assert (lex_is_integer ());
 494   return tokval;
 495 }
 496 \f
 497 /* Token matching functions. */
 498
 499 /* If TOK is the current token, skips it and returns nonzero.
 500    Otherwise, returns zero. */
 501 int
 502 lex_match (int t)
 503 {
 504   if (token == t)
 505     {
 506       lex_get ();
 507       return 1;
 508     }
 509   else
 510     return 0;
 511 }
 512
 513 /* If the current token is the identifier S, skips it and returns
 514    nonzero.  The identifier may be abbreviated to its first three
 515    letters.
 516    Otherwise, returns zero. */
 517 int
 518 lex_match_id (const char *s)
 519 {
 520   if (token == T_ID && lex_id_match (s, tokid))
 521     {
 522       lex_get ();
 523       return 1;
 524     }
 525   else
 526     return 0;
 527 }
 528
 529 /* If the current token is integer N, skips it and returns nonzero.
 530    Otherwise, returns zero. */
 531 int
 532 lex_match_int (int x)
 533 {
 534   if (lex_is_integer () && lex_integer () == x)
 535     {
 536       lex_get ();
 537       return 1;
 538     }
 539   else
 540     return 0;
 541 }
 542 \f
 543 /* Forced matches. */
 544
 545 /* If this token is identifier S, fetches the next token and returns
 546    nonzero.
 547    Otherwise, reports an error and returns zero. */
 548 int
 549 lex_force_match_id (const char *s)
 550 {
 551   if (token == T_ID && lex_id_match (s, tokid))
 552     {
 553       lex_get ();
 554       return 1;
 555     }
 556   else
 557     {
 558       lex_error (_("expecting `%s'"), s);
 559       return 0;
 560     }
 561 }
 562
 563 /* If the current token is T, skips the token.  Otherwise, reports an
 564    error and returns from the current function with return value 0. */
 565 int
 566 lex_force_match (int t)
 567 {
 568   if (token == t)
 569     {
 570       lex_get ();
 571       return 1;
 572     }
 573   else
 574     {
 575       lex_error (_("expecting `%s'"), lex_token_name (t));
 576       return 0;
 577     }
 578 }
 579
 580 /* If this token is a string, does nothing and returns nonzero.
 581    Otherwise, reports an error and returns zero. */
 582 int
 583 lex_force_string (void)
 584 {
 585   if (token == T_STRING)
 586     return 1;
 587   else
 588     {
 589       lex_error (_("expecting string"));
 590       return 0;
 591     }
 592 }
 593
 594 /* If this token is an integer, does nothing and returns nonzero.
 595    Otherwise, reports an error and returns zero. */
 596 int
 597 lex_force_int (void)
 598 {
 599   if (lex_is_integer ())
 600     return 1;
 601   else
 602     {
 603       lex_error (_("expecting integer"));
 604       return 0;
 605     }
 606 }
 607
 608 /* If this token is a number, does nothing and returns nonzero.
 609    Otherwise, reports an error and returns zero. */
 610 int
 611 lex_force_num (void)
 612 {
 613   if (lex_is_number ())
 614     return 1;
 615   else
 616     {
 617       lex_error (_("expecting number"));
 618       return 0;
 619     }
 620 }
 621
 622 /* If this token is an identifier, does nothing and returns nonzero.
 623    Otherwise, reports an error and returns zero. */
 624 int
 625 lex_force_id (void)
 626 {
 627   if (token == T_ID)
 628     return 1;
 629   else
 630     {
 631       lex_error (_("expecting identifier"));
 632       return 0;
 633     }
 634 }
 635 /* Weird token functions. */
 636
 637 /* Returns the first character of the next token, except that if the
 638    next token is not an identifier, the character returned will not be
 639    a character that can begin an identifier.  Specifically, the
 640    hexstring lead-in X' causes lookahead() to return '.  Note that an
 641    alphanumeric return value doesn't guarantee an ID token, it could
 642    also be a reserved-word token. */
 643 int
 644 lex_look_ahead (void)
 645 {
 646   if (put_token)
 647     return put_token;
 648
 649   for (;;)
 650     {
 651       if (eof)
 652         return 0;
 653
 654       for (;;)
 655         {
 656           while (isspace ((unsigned char) *prog))
 657             prog++;
 658           if (*prog)
 659             break;
 660
 661           if (dot)
 662             return '.';
 663           else if (!lex_get_line ())
 664             return 0;
 665
 666           if (put_token)
 667             return put_token;
 668         }
 669
 670       if ((toupper ((unsigned char) *prog) == 'X'
 671            || toupper ((unsigned char) *prog) == 'B'
 672            || toupper ((unsigned char) *prog) == 'O')
 673           && (prog[1] == '\'' || prog[1] == '"'))
 674         return '\'';
 675
 676       return *prog;
 677     }
 678 }
 679
 680 /* Makes the current token become the next token to be read; the
 681    current token is set to T. */
 682 void
 683 lex_put_back (int t)
 684 {
 685   save_token ();
 686   token = t;
 687 }
 688
 689 /* Makes the current token become the next token to be read; the
 690    current token is set to the identifier ID. */
 691 void
 692 lex_put_back_id (const char *id)
 693 {
 694   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 695   save_token ();
 696   token = T_ID;
 697   ds_assign_cstr (&tokstr, id);
 698   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 699 }
 700 \f
 701 /* Weird line processing functions. */
 702
 703 /* Returns the entire contents of the current line. */
 704 const char *
 705 lex_entire_line (void)
 706 {
 707   return ds_cstr (&getl_buf);
 708 }
 709
 710 /* As lex_entire_line(), but only returns the part of the current line
 711    that hasn't already been tokenized.
 712    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 713    ends with a terminal dot, or zero if it doesn't. */
 714 const char *
 715 lex_rest_of_line (int *end_dot)
 716 {
 717   if (end_dot)
 718     *end_dot = dot;
 719   return prog;
 720 }
 721
 722 /* Causes the rest of the current input line to be ignored for
 723    tokenization purposes. */
 724 void
 725 lex_discard_line (void)
 726 {
 727   prog = ds_end (&getl_buf);
 728   dot = put_token = 0;
 729 }
 730
 731 /* Sets the current position in the current line to P, which must be
 732    in getl_buf. */
 733 void
 734 lex_set_prog (char *p)
 735 {
 736   prog = p;
 737 }
 738
 739 /* Discards the rest of the current command.
 740    When we're reading commands from a file, we skip tokens until
 741    a terminal dot or EOF.
 742    When we're reading commands interactively from the user,
 743    that's just discarding the current line, because presumably
 744    the user doesn't want to finish typing a command that will be
 745    ignored anyway. */
 746 void
 747 lex_discard_rest_of_command (void)
 748 {
 749   if (!getl_is_interactive ())
 750     {
 751       while (token != T_STOP && token != '.')
 752         lex_get ();
 753     }
 754   else
 755     lex_discard_line ();
 756 }
 757 \f
 758 /* Weird line reading functions. */
 759
 760 /* Remove C-style comments in STRING, begun by slash-star and
 761    terminated by star-slash or newline. */
 762 static void
 763 strip_comments (struct string *string)
 764 {
 765   char *cp;
 766   int quote;
 767   bool in_comment;
 768
 769   in_comment = false;
 770   quote = EOF;
 771   for (cp = ds_cstr (string); *cp; )
 772     {
 773       /* If we're not in a comment, check for quote marks. */
 774       if (!in_comment)
 775         {
 776           if (*cp == quote)
 777             quote = EOF;
 778           else if (*cp == '\'' || *cp == '"')
 779             quote = *cp;
 780         }
 781
 782       /* If we're not inside a quotation, check for comment. */
 783       if (quote == EOF)
 784         {
 785           if (cp[0] == '/' && cp[1] == '*')
 786             {
 787               in_comment = true;
 788               *cp++ = ' ';
 789               *cp++ = ' ';
 790               continue;
 791             }
 792           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 793             {
 794               in_comment = false;
 795               *cp++ = ' ';
 796               *cp++ = ' ';
 797               continue;
 798             }
 799         }
 800
 801       /* Check commenting. */
 802       if (in_comment)
 803         *cp = ' ';
 804       cp++;
 805     }
 806 }
 807
 808 /* Reads a line for use by the tokenizer, and preprocesses it by
 809    removing comments, stripping trailing whitespace and the
 810    terminal dot, and removing leading indentors. */
 811 bool
 812 lex_get_line (void)
 813 {
 814   struct string *line = &getl_buf;
 815   bool interactive;
 816
 817   if (!getl_read_line (&interactive))
 818     return false;
 819
 820   strip_comments (line);
 821   ds_rtrim (line, ss_cstr (CC_SPACES));
 822
 823   /* Check for and remove terminal dot. */
 824   dot = (ds_chomp (line, get_endcmd ())
 825          || (ds_is_empty (line) && get_nulline ()));
 826
 827   /* Strip leading indentors or insert a terminal dot (unless the
 828      line was obtained interactively). */
 829   if (!interactive)
 830     {
 831       int first = ds_first (line);
 832
 833       if (first == '+' || first == '-')
 834         *ds_data (line) = ' ';
 835       else if (first != EOF && !isspace (first))
 836         put_token = '.';
 837     }
 838
 839   prog = ds_cstr (line);
 840
 841   return true;
 842 }
 843 \f
 844 /* Token names. */
 845
 846 /* Returns the name of a token in a static buffer. */
 847 const char *
 848 lex_token_name (int token)
 849 {
 850   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 851     return keywords[token - T_FIRST_KEYWORD];
 852
 853   if (token < 256)
 854     {
 855       static char t[2];
 856       t[0] = token;
 857       return t;
 858     }
 859
 860   return _("<ERROR>");
 861 }
 862
 863 /* Returns an ASCII representation of the current token as a
 864    malloc()'d string. */
 865 char *
 866 lex_token_representation (void)
 867 {
 868   char *token_rep;
 869
 870   switch (token)
 871     {
 872     case T_ID:
 873     case T_POS_NUM:
 874     case T_NEG_NUM:
 875       return ds_xstrdup (&tokstr);
 876       break;
 877
 878     case T_STRING:
 879       {
 880         int hexstring = 0;
 881         char *sp, *dp;
 882
 883         for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 884           if (!isprint ((unsigned char) *sp))
 885             {
 886               hexstring = 1;
 887               break;
 888             }
 889
 890         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 891
 892         dp = token_rep;
 893         if (hexstring)
 894           *dp++ = 'X';
 895         *dp++ = '\'';
 896
 897         if (!hexstring)
 898           for (sp = ds_cstr (&tokstr); *sp; )
 899             {
 900               if (*sp == '\'')
 901                 *dp++ = '\'';
 902               *dp++ = (unsigned char) *sp++;
 903             }
 904         else
 905           for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 906             {
 907               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 908               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 909             }
 910         *dp++ = '\'';
 911         *dp = '\0';
 912
 913         return token_rep;
 914       }
 915     break;
 916
 917     case T_STOP:
 918       token_rep = xmalloc (1);
 919       *token_rep = '\0';
 920       return token_rep;
 921
 922     case T_EXP:
 923       return xstrdup ("**");
 924
 925     default:
 926       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 927         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 928       else
 929         {
 930           token_rep = xmalloc (2);
 931           token_rep[0] = token;
 932           token_rep[1] = '\0';
 933           return token_rep;
 934         }
 935     }
 936
 937   assert (0);
 938 }
 939 \f
 940 /* Really weird functions. */
 941
 942 /* Most of the time, a `-' is a lead-in to a negative number.  But
 943    sometimes it's actually part of the syntax.  If a dash can be part
 944    of syntax then this function is called to rip it off of a
 945    number. */
 946 void
 947 lex_negative_to_dash (void)
 948 {
 949   if (token == T_NEG_NUM)
 950     {
 951       token = T_POS_NUM;
 952       tokval = -tokval;
 953       ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
 954       save_token ();
 955       token = '-';
 956     }
 957 }
 958
 959 /* We're not at eof any more. */
 960 void
 961 lex_reset_eof (void)
 962 {
 963   eof = 0;
 964 }
 965
 966 /* Skip a COMMENT command. */
 967 void
 968 lex_skip_comment (void)
 969 {
 970   for (;;)
 971     {
 972       if (!lex_get_line ())
 973         {
 974           put_token = T_STOP;
 975           eof = 1;
 976           return;
 977         }
 978
 979       if (put_token == '.')
 980         break;
 981
 982       prog = ds_end (&getl_buf);
 983       if (dot)
 984         break;
 985     }
 986 }
 987 \f
 988 /* Private functions. */
 989
 990 /* When invoked, tokstr contains a string of binary, octal, or
 991    hex digits, according to TYPE.  The string is converted to
 992    characters having the specified values. */
 993 static void
 994 convert_numeric_string_to_char_string (enum string_type type)
 995 {
 996   const char *base_name;
 997   int base;
 998   int chars_per_byte;
 999   size_t byte_cnt;
1000   size_t i;
1001   char *p;
1002
1003   switch (type)
1004     {
1005     case BINARY_STRING:
1006       base_name = _("binary");
1007       base = 2;
1008       chars_per_byte = 8;
1009       break;
1010     case OCTAL_STRING:
1011       base_name = _("octal");
1012       base = 8;
1013       chars_per_byte = 3;
1014       break;
1015     case HEX_STRING:
1016       base_name = _("hex");
1017       base = 16;
1018       chars_per_byte = 2;
1019       break;
1020     default:
1021       abort ();
1022     }
1023
1024   byte_cnt = ds_length (&tokstr) / chars_per_byte;
1025   if (ds_length (&tokstr) % chars_per_byte)
1026     msg (SE, _("String of %s digits has %d characters, which is not a "
1027                "multiple of %d."),
1028          base_name, ds_length (&tokstr), chars_per_byte);
1029
1030   p = ds_cstr (&tokstr);
1031   for (i = 0; i < byte_cnt; i++)
1032     {
1033       int value;
1034       int j;
1035
1036       value = 0;
1037       for (j = 0; j < chars_per_byte; j++, p++)
1038         {
1039           int v;
1040
1041           if (*p >= '0' && *p <= '9')
1042             v = *p - '0';
1043           else
1044             {
1045               static const char alpha[] = "abcdef";
1046               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1047
1048               if (q)
1049                 v = q - alpha + 10;
1050               else
1051                 v = base;
1052             }
1053
1054           if (v >= base)
1055             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1056
1057           value = value * base + v;
1058         }
1059
1060       ds_cstr (&tokstr)[i] = (unsigned char) value;
1061     }
1062
1063   ds_truncate (&tokstr, byte_cnt);
1064 }
1065
1066 /* Parses a string from the input buffer into tokstr.  The input
1067    buffer pointer prog must point to the initial single or double
1068    quote.  TYPE indicates the type of string to be parsed.
1069    Returns token type. */
1070 static int
1071 parse_string (enum string_type type)
1072 {
1073   /* Accumulate the entire string, joining sections indicated by +
1074      signs. */
1075   for (;;)
1076     {
1077       /* Single or double quote. */
1078       int c = *prog++;
1079
1080       /* Accumulate section. */
1081       for (;;)
1082         {
1083           /* Check end of line. */
1084           if (*prog == '\0')
1085             {
1086               msg (SE, _("Unterminated string constant."));
1087               goto finish;
1088             }
1089
1090           /* Double quote characters to embed them in strings. */
1091           if (*prog == c)
1092             {
1093               if (prog[1] == c)
1094                 prog++;
1095               else
1096                 break;
1097             }
1098
1099           ds_put_char (&tokstr, *prog++);
1100         }
1101       prog++;
1102
1103       /* Skip whitespace after final quote mark. */
1104       if (eof)
1105         break;
1106       for (;;)
1107         {
1108           while (isspace ((unsigned char) *prog))
1109             prog++;
1110           if (*prog)
1111             break;
1112
1113           if (dot)
1114             goto finish;
1115
1116           if (!lex_get_line ())
1117             goto finish;
1118         }
1119
1120       /* Skip plus sign. */
1121       if (*prog != '+')
1122         break;
1123       prog++;
1124
1125       /* Skip whitespace after plus sign. */
1126       if (eof)
1127         break;
1128       for (;;)
1129         {
1130           while (isspace ((unsigned char) *prog))
1131             prog++;
1132           if (*prog)
1133             break;
1134
1135           if (dot)
1136             goto finish;
1137
1138           if (!lex_get_line ())
1139             {
1140               msg (SE, _("Unexpected end of file in string concatenation."));
1141               goto finish;
1142             }
1143         }
1144
1145       /* Ensure that a valid string follows. */
1146       if (*prog != '\'' && *prog != '"')
1147         {
1148           msg (SE, _("String expected following `+'."));
1149           goto finish;
1150         }
1151     }
1152
1153   /* We come here when we've finished concatenating all the string sections
1154      into one large string. */
1155 finish:
1156   if (type != CHARACTER_STRING)
1157     convert_numeric_string_to_char_string (type);
1158
1159   if (ds_length (&tokstr) > 255)
1160     {
1161       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1162            ds_length (&tokstr));
1163       ds_truncate (&tokstr, 255);
1164     }
1165
1166   {
1167     /* FIXME. */
1168     size_t i;
1169     int warned = 0;
1170
1171     for (i = 0; i < ds_length (&tokstr); i++)
1172       if (ds_cstr (&tokstr)[i] == 0)
1173         {
1174           if (!warned)
1175             {
1176               msg (SE, _("Sorry, literal strings may not contain null "
1177                          "characters.  Replacing with spaces."));
1178               warned = 1;
1179             }
1180           ds_cstr (&tokstr)[i] = ' ';
1181         }
1182   }
1183
1184   return T_STRING;
1185 }
1186 \f
1187 #if DUMP_TOKENS
1188 /* Reads one token from the lexer and writes a textual representation
1189    on stdout for debugging purposes. */
1190 static void
1191 dump_token (void)
1192 {
1193   {
1194     const char *curfn;
1195     int curln;
1196
1197     getl_location (&curfn, &curln);
1198     if (curfn)
1199       fprintf (stderr, "%s:%d\t", curfn, curln);
1200   }
1201
1202   switch (token)
1203     {
1204     case T_ID:
1205       fprintf (stderr, "ID\t%s\n", tokid);
1206       break;
1207
1208     case T_POS_NUM:
1209     case T_NEG_NUM:
1210       fprintf (stderr, "NUM\t%f\n", tokval);
1211       break;
1212
1213     case T_STRING:
1214       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1215       break;
1216
1217     case T_STOP:
1218       fprintf (stderr, "STOP\n");
1219       break;
1220
1221     case T_EXP:
1222       fprintf (stderr, "MISC\tEXP\"");
1223       break;
1224
1225     case 0:
1226       fprintf (stderr, "MISC\tEOF\n");
1227       break;
1228
1229     default:
1230       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1231         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1232       else
1233         fprintf (stderr, "PUNCT\t%c\n", token);
1234       break;
1235     }
1236 }
1237 #endif /* DUMP_TOKENS */