src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <language/line-buffer.h>
  34 #include <libpspp/magic.h>
  35 #include <data/settings.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44 /*
  45 #define DUMP_TOKENS 1
  46 */
  47
  48 \f
  49 /* Global variables. */
  50
  51 extern const char *keywords[T_N_KEYWORDS + 1];
  52
  53
  54 /* Current token. */
  55 int token;
  56
  57 /* T_POS_NUM, T_NEG_NUM: the token's value. */
  58 double tokval;
  59
  60 /* T_ID: the identifier. */
  61 char tokid[LONG_NAME_LEN + 1];
  62
  63 /* T_ID, T_STRING: token string value.
  64    For T_ID, this is not truncated as is tokid. */
  65 struct string tokstr;
  66 \f
  67 /* Static variables. */
  68
  69 /* Pointer to next token in getl_buf. */
  70 static char *prog;
  71
  72 /* Nonzero only if this line ends with a terminal dot. */
  73 static int dot;
  74
  75 /* Nonzero only if the last token returned was T_STOP. */
  76 static int eof;
  77
  78 /* If nonzero, next token returned by lex_get().
  79    Used only in exceptional circumstances. */
  80 static int put_token;
  81 static struct string put_tokstr;
  82 static double put_tokval;
  83
  84 static int parse_id (void);
  85
  86 /* How a string represents its contents. */
  87 enum string_type
  88   {
  89     CHARACTER_STRING,   /* Characters. */
  90     BINARY_STRING,      /* Binary digits. */
  91     OCTAL_STRING,       /* Octal digits. */
  92     HEX_STRING          /* Hexadecimal digits. */
  93   };
  94
  95 static int parse_string (enum string_type);
  96
  97 #if DUMP_TOKENS
  98 static void dump_token (void);
  99 #endif
 100 \f
 101 /* Initialization. */
 102
 103 /* Initializes the lexer. */
 104 void
 105 lex_init (void)
 106 {
 107   ds_init_empty (&tokstr);
 108   ds_init_empty (&put_tokstr);
 109   if (!lex_get_line ())
 110     eof = true;
 111 }
 112
 113 void
 114 lex_done (void)
 115 {
 116   ds_destroy (&put_tokstr);
 117   ds_destroy (&tokstr);
 118 }
 119
 120 \f
 121 /* Common functions. */
 122
 123 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
 124    tokval, respectively, and sets tokid appropriately. */
 125 static void
 126 restore_token (void)
 127 {
 128   assert (put_token != 0);
 129   token = put_token;
 130   ds_assign_string (&tokstr, &put_tokstr);
 131   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 132   tokval = put_tokval;
 133   put_token = 0;
 134 }
 135
 136 /* Copies token, tokstr, tokval into put_token, put_tokstr,
 137    put_tokval respectively. */
 138 static void
 139 save_token (void)
 140 {
 141   put_token = token;
 142   ds_assign_string (&put_tokstr, &tokstr);
 143   put_tokval = tokval;
 144 }
 145
 146 /* Parses a single token, setting appropriate global variables to
 147    indicate the token's attributes. */
 148 void
 149 lex_get (void)
 150 {
 151   /* If a token was pushed ahead, return it. */
 152   if (put_token)
 153     {
 154       restore_token ();
 155 #if DUMP_TOKENS
 156       dump_token ();
 157 #endif
 158       return;
 159     }
 160
 161   /* Find a token. */
 162   for (;;)
 163     {
 164       /* Skip whitespace. */
 165       if (eof)
 166         {
 167           token = T_STOP;
 168           return;
 169         }
 170
 171       for (;;)
 172         {
 173           while (isspace ((unsigned char) *prog))
 174             prog++;
 175           if (*prog)
 176             break;
 177
 178           if (dot)
 179             {
 180               dot = 0;
 181               token = '.';
 182 #if DUMP_TOKENS
 183               dump_token ();
 184 #endif
 185               return;
 186             }
 187           else if (!lex_get_line ())
 188             {
 189               eof = 1;
 190               token = T_STOP;
 191 #if DUMP_TOKENS
 192               dump_token ();
 193 #endif
 194               return;
 195             }
 196
 197           if (put_token)
 198             {
 199               restore_token ();
 200 #if DUMP_TOKENS
 201               dump_token ();
 202 #endif
 203               return;
 204             }
 205         }
 206
 207
 208       /* Actually parse the token. */
 209       ds_clear (&tokstr);
 210
 211       switch (*prog)
 212         {
 213         case '-': case '.':
 214         case '0': case '1': case '2': case '3': case '4':
 215         case '5': case '6': case '7': case '8': case '9':
 216           {
 217             char *tail;
 218
 219             /* `-' can introduce a negative number, or it can be a
 220                token by itself.  If it is not followed by a digit or a
 221                decimal point, it is definitely not a number.
 222                Otherwise, it might be either, but most of the time we
 223                want it as a number.  When the syntax calls for a `-'
 224                token, lex_negative_to_dash() must be used to break
 225                negative numbers into two tokens. */
 226             if (*prog == '-')
 227               {
 228                 ds_put_char (&tokstr, *prog++);
 229                 while (isspace ((unsigned char) *prog))
 230                   prog++;
 231
 232                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 233                   {
 234                     token = '-';
 235                     break;
 236                   }
 237                 token = T_NEG_NUM;
 238               }
 239             else
 240               token = T_POS_NUM;
 241
 242             /* Parse the number, copying it into tokstr. */
 243             while (isdigit ((unsigned char) *prog))
 244               ds_put_char (&tokstr, *prog++);
 245             if (*prog == '.')
 246               {
 247                 ds_put_char (&tokstr, *prog++);
 248                 while (isdigit ((unsigned char) *prog))
 249                   ds_put_char (&tokstr, *prog++);
 250               }
 251             if (*prog == 'e' || *prog == 'E')
 252               {
 253                 ds_put_char (&tokstr, *prog++);
 254                 if (*prog == '+' || *prog == '-')
 255                   ds_put_char (&tokstr, *prog++);
 256                 while (isdigit ((unsigned char) *prog))
 257                   ds_put_char (&tokstr, *prog++);
 258               }
 259
 260             /* Parse as floating point. */
 261             tokval = strtod (ds_cstr (&tokstr), &tail);
 262             if (*tail)
 263               {
 264                 msg (SE, _("%s does not form a valid number."),
 265                      ds_cstr (&tokstr));
 266                 tokval = 0.0;
 267
 268                 ds_clear (&tokstr);
 269                 ds_put_char (&tokstr, '0');
 270               }
 271
 272             break;
 273           }
 274
 275         case '\'': case '"':
 276           token = parse_string (CHARACTER_STRING);
 277           break;
 278
 279         case '(': case ')': case ',': case '=': case '+': case '/':
 280           token = *prog++;
 281           break;
 282
 283         case '*':
 284           if (*++prog == '*')
 285             {
 286               prog++;
 287               token = T_EXP;
 288             }
 289           else
 290             token = '*';
 291           break;
 292
 293         case '<':
 294           if (*++prog == '=')
 295             {
 296               prog++;
 297               token = T_LE;
 298             }
 299           else if (*prog == '>')
 300             {
 301               prog++;
 302               token = T_NE;
 303             }
 304           else
 305             token = T_LT;
 306           break;
 307
 308         case '>':
 309           if (*++prog == '=')
 310             {
 311               prog++;
 312               token = T_GE;
 313             }
 314           else
 315             token = T_GT;
 316           break;
 317
 318         case '~':
 319           if (*++prog == '=')
 320             {
 321               prog++;
 322               token = T_NE;
 323             }
 324           else
 325             token = T_NOT;
 326           break;
 327
 328         case '&':
 329           prog++;
 330           token = T_AND;
 331           break;
 332
 333         case '|':
 334           prog++;
 335           token = T_OR;
 336           break;
 337
 338         case 'b': case 'B':
 339           if (prog[1] == '\'' || prog[1] == '"')
 340             token = parse_string (BINARY_STRING);
 341           else
 342             token = parse_id ();
 343           break;
 344
 345         case 'o': case 'O':
 346           if (prog[1] == '\'' || prog[1] == '"')
 347             token = parse_string (OCTAL_STRING);
 348           else
 349             token = parse_id ();
 350           break;
 351
 352         case 'x': case 'X':
 353           if (prog[1] == '\'' || prog[1] == '"')
 354             token = parse_string (HEX_STRING);
 355           else
 356             token = parse_id ();
 357           break;
 358
 359         default:
 360           if (lex_is_id1 (*prog))
 361             {
 362               token = parse_id ();
 363               break;
 364             }
 365           else
 366             {
 367               if (isgraph ((unsigned char) *prog))
 368                 msg (SE, _("Bad character in input: `%c'."), *prog++);
 369               else
 370                 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 371               continue;
 372             }
 373         }
 374       break;
 375     }
 376
 377 #if DUMP_TOKENS
 378   dump_token ();
 379 #endif
 380 }
 381
 382 /* Parses an identifier at the current position into tokid and
 383    tokstr.
 384    Returns the correct token type. */
 385 static int
 386 parse_id (void)
 387 {
 388   const char *start = prog;
 389   prog = lex_skip_identifier (start);
 390
 391   ds_put_substring (&tokstr, ss_buffer (start, prog - start));
 392   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 393   return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
 394 }
 395
 396 /* Reports an error to the effect that subcommand SBC may only be
 397    specified once. */
 398 void
 399 lex_sbc_only_once (const char *sbc)
 400 {
 401   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 402 }
 403
 404 /* Reports an error to the effect that subcommand SBC is
 405    missing. */
 406 void
 407 lex_sbc_missing (const char *sbc)
 408 {
 409   lex_error (_("missing required subcommand %s"), sbc);
 410 }
 411
 412 /* Prints a syntax error message containing the current token and
 413    given message MESSAGE (if non-null). */
 414 void
 415 lex_error (const char *message, ...)
 416 {
 417   char *token_rep;
 418   char where[128];
 419
 420   token_rep = lex_token_representation ();
 421   if (token == T_STOP)
 422     strcpy (where, "end of file");
 423   else if (token == '.')
 424     strcpy (where, "end of command");
 425   else
 426     snprintf (where, sizeof where, "`%s'", token_rep);
 427   free (token_rep);
 428
 429   if (message)
 430     {
 431       char buf[1024];
 432       va_list args;
 433
 434       va_start (args, message);
 435       vsnprintf (buf, 1024, message, args);
 436       va_end (args);
 437
 438       msg (SE, _("Syntax error %s at %s."), buf, where);
 439     }
 440   else
 441     msg (SE, _("Syntax error at %s."), where);
 442 }
 443
 444 /* Checks that we're at end of command.
 445    If so, returns a successful command completion code.
 446    If not, flags a syntax error and returns an error command
 447    completion code. */
 448 int
 449 lex_end_of_command (void)
 450 {
 451   if (token != '.')
 452     {
 453       lex_error (_("expecting end of command"));
 454       return CMD_FAILURE;
 455     }
 456   else
 457     return CMD_SUCCESS;
 458 }
 459 \f
 460 /* Token testing functions. */
 461
 462 /* Returns true if the current token is a number. */
 463 bool
 464 lex_is_number (void)
 465 {
 466   return token == T_POS_NUM || token == T_NEG_NUM;
 467 }
 468
 469 /* Returns the value of the current token, which must be a
 470    floating point number. */
 471 double
 472 lex_number (void)
 473 {
 474   assert (lex_is_number ());
 475   return tokval;
 476 }
 477
 478 /* Returns true iff the current token is an integer. */
 479 bool
 480 lex_is_integer (void)
 481 {
 482   return (lex_is_number ()
 483           && tokval != NOT_LONG
 484           && tokval >= LONG_MIN
 485           && tokval <= LONG_MAX
 486           && floor (tokval) == tokval);
 487 }
 488
 489 /* Returns the value of the current token, which must be an
 490    integer. */
 491 long
 492 lex_integer (void)
 493 {
 494   assert (lex_is_integer ());
 495   return tokval;
 496 }
 497 \f
 498 /* Token matching functions. */
 499
 500 /* If TOK is the current token, skips it and returns nonzero.
 501    Otherwise, returns zero. */
 502 int
 503 lex_match (int t)
 504 {
 505   if (token == t)
 506     {
 507       lex_get ();
 508       return 1;
 509     }
 510   else
 511     return 0;
 512 }
 513
 514 /* If the current token is the identifier S, skips it and returns
 515    nonzero.  The identifier may be abbreviated to its first three
 516    letters.
 517    Otherwise, returns zero. */
 518 int
 519 lex_match_id (const char *s)
 520 {
 521   if (token == T_ID && lex_id_match (s, tokid))
 522     {
 523       lex_get ();
 524       return 1;
 525     }
 526   else
 527     return 0;
 528 }
 529
 530 /* If the current token is integer N, skips it and returns nonzero.
 531    Otherwise, returns zero. */
 532 int
 533 lex_match_int (int x)
 534 {
 535   if (lex_is_integer () && lex_integer () == x)
 536     {
 537       lex_get ();
 538       return 1;
 539     }
 540   else
 541     return 0;
 542 }
 543 \f
 544 /* Forced matches. */
 545
 546 /* If this token is identifier S, fetches the next token and returns
 547    nonzero.
 548    Otherwise, reports an error and returns zero. */
 549 int
 550 lex_force_match_id (const char *s)
 551 {
 552   if (token == T_ID && lex_id_match (s, tokid))
 553     {
 554       lex_get ();
 555       return 1;
 556     }
 557   else
 558     {
 559       lex_error (_("expecting `%s'"), s);
 560       return 0;
 561     }
 562 }
 563
 564 /* If the current token is T, skips the token.  Otherwise, reports an
 565    error and returns from the current function with return value 0. */
 566 int
 567 lex_force_match (int t)
 568 {
 569   if (token == t)
 570     {
 571       lex_get ();
 572       return 1;
 573     }
 574   else
 575     {
 576       lex_error (_("expecting `%s'"), lex_token_name (t));
 577       return 0;
 578     }
 579 }
 580
 581 /* If this token is a string, does nothing and returns nonzero.
 582    Otherwise, reports an error and returns zero. */
 583 int
 584 lex_force_string (void)
 585 {
 586   if (token == T_STRING)
 587     return 1;
 588   else
 589     {
 590       lex_error (_("expecting string"));
 591       return 0;
 592     }
 593 }
 594
 595 /* If this token is an integer, does nothing and returns nonzero.
 596    Otherwise, reports an error and returns zero. */
 597 int
 598 lex_force_int (void)
 599 {
 600   if (lex_is_integer ())
 601     return 1;
 602   else
 603     {
 604       lex_error (_("expecting integer"));
 605       return 0;
 606     }
 607 }
 608
 609 /* If this token is a number, does nothing and returns nonzero.
 610    Otherwise, reports an error and returns zero. */
 611 int
 612 lex_force_num (void)
 613 {
 614   if (lex_is_number ())
 615     return 1;
 616   else
 617     {
 618       lex_error (_("expecting number"));
 619       return 0;
 620     }
 621 }
 622
 623 /* If this token is an identifier, does nothing and returns nonzero.
 624    Otherwise, reports an error and returns zero. */
 625 int
 626 lex_force_id (void)
 627 {
 628   if (token == T_ID)
 629     return 1;
 630   else
 631     {
 632       lex_error (_("expecting identifier"));
 633       return 0;
 634     }
 635 }
 636 /* Weird token functions. */
 637
 638 /* Returns the first character of the next token, except that if the
 639    next token is not an identifier, the character returned will not be
 640    a character that can begin an identifier.  Specifically, the
 641    hexstring lead-in X' causes lookahead() to return '.  Note that an
 642    alphanumeric return value doesn't guarantee an ID token, it could
 643    also be a reserved-word token. */
 644 int
 645 lex_look_ahead (void)
 646 {
 647   if (put_token)
 648     return put_token;
 649
 650   for (;;)
 651     {
 652       if (eof)
 653         return 0;
 654
 655       for (;;)
 656         {
 657           while (isspace ((unsigned char) *prog))
 658             prog++;
 659           if (*prog)
 660             break;
 661
 662           if (dot)
 663             return '.';
 664           else if (!lex_get_line ())
 665             return 0;
 666
 667           if (put_token)
 668             return put_token;
 669         }
 670
 671       if ((toupper ((unsigned char) *prog) == 'X'
 672            || toupper ((unsigned char) *prog) == 'B'
 673            || toupper ((unsigned char) *prog) == 'O')
 674           && (prog[1] == '\'' || prog[1] == '"'))
 675         return '\'';
 676
 677       return *prog;
 678     }
 679 }
 680
 681 /* Makes the current token become the next token to be read; the
 682    current token is set to T. */
 683 void
 684 lex_put_back (int t)
 685 {
 686   save_token ();
 687   token = t;
 688 }
 689
 690 /* Makes the current token become the next token to be read; the
 691    current token is set to the identifier ID. */
 692 void
 693 lex_put_back_id (const char *id)
 694 {
 695   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 696   save_token ();
 697   token = T_ID;
 698   ds_assign_cstr (&tokstr, id);
 699   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 700 }
 701 \f
 702 /* Weird line processing functions. */
 703
 704 /* Returns the entire contents of the current line. */
 705 const char *
 706 lex_entire_line (void)
 707 {
 708   return ds_cstr (&getl_buf);
 709 }
 710
 711 /* As lex_entire_line(), but only returns the part of the current line
 712    that hasn't already been tokenized.
 713    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 714    ends with a terminal dot, or zero if it doesn't. */
 715 const char *
 716 lex_rest_of_line (int *end_dot)
 717 {
 718   if (end_dot)
 719     *end_dot = dot;
 720   return prog;
 721 }
 722
 723 /* Causes the rest of the current input line to be ignored for
 724    tokenization purposes. */
 725 void
 726 lex_discard_line (void)
 727 {
 728   prog = ds_end (&getl_buf);
 729   dot = put_token = 0;
 730 }
 731
 732 /* Sets the current position in the current line to P, which must be
 733    in getl_buf. */
 734 void
 735 lex_set_prog (char *p)
 736 {
 737   prog = p;
 738 }
 739
 740 /* Discards the rest of the current command.
 741    When we're reading commands from a file, we skip tokens until
 742    a terminal dot or EOF.
 743    When we're reading commands interactively from the user,
 744    that's just discarding the current line, because presumably
 745    the user doesn't want to finish typing a command that will be
 746    ignored anyway. */
 747 void
 748 lex_discard_rest_of_command (void)
 749 {
 750   if (!getl_is_interactive ())
 751     {
 752       while (token != T_STOP && token != '.')
 753         lex_get ();
 754     }
 755   else
 756     lex_discard_line ();
 757 }
 758 \f
 759 /* Weird line reading functions. */
 760
 761 /* Remove C-style comments in STRING, begun by slash-star and
 762    terminated by star-slash or newline. */
 763 static void
 764 strip_comments (struct string *string)
 765 {
 766   char *cp;
 767   int quote;
 768   bool in_comment;
 769
 770   in_comment = false;
 771   quote = EOF;
 772   for (cp = ds_cstr (string); *cp; )
 773     {
 774       /* If we're not in a comment, check for quote marks. */
 775       if (!in_comment)
 776         {
 777           if (*cp == quote)
 778             quote = EOF;
 779           else if (*cp == '\'' || *cp == '"')
 780             quote = *cp;
 781         }
 782
 783       /* If we're not inside a quotation, check for comment. */
 784       if (quote == EOF)
 785         {
 786           if (cp[0] == '/' && cp[1] == '*')
 787             {
 788               in_comment = true;
 789               *cp++ = ' ';
 790               *cp++ = ' ';
 791               continue;
 792             }
 793           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 794             {
 795               in_comment = false;
 796               *cp++ = ' ';
 797               *cp++ = ' ';
 798               continue;
 799             }
 800         }
 801
 802       /* Check commenting. */
 803       if (in_comment)
 804         *cp = ' ';
 805       cp++;
 806     }
 807 }
 808
 809 /* Reads a line for use by the tokenizer, and preprocesses it by
 810    removing comments, stripping trailing whitespace and the
 811    terminal dot, and removing leading indentors. */
 812 bool
 813 lex_get_line (void)
 814 {
 815   struct string *line = &getl_buf;
 816   bool interactive;
 817
 818   if (!getl_read_line (&interactive))
 819     return false;
 820
 821   strip_comments (line);
 822   ds_rtrim (line, ss_cstr (CC_SPACES));
 823
 824   /* Check for and remove terminal dot. */
 825   dot = (ds_chomp (line, get_endcmd ())
 826          || (ds_is_empty (line) && get_nulline ()));
 827
 828   /* Strip leading indentors or insert a terminal dot (unless the
 829      line was obtained interactively). */
 830   if (!interactive)
 831     {
 832       int first = ds_first (line);
 833
 834       if (first == '+' || first == '-')
 835         *ds_data (line) = ' ';
 836       else if (first != EOF && !isspace (first))
 837         put_token = '.';
 838     }
 839
 840   prog = ds_cstr (line);
 841
 842   return true;
 843 }
 844 \f
 845 /* Token names. */
 846
 847 /* Returns the name of a token in a static buffer. */
 848 const char *
 849 lex_token_name (int token)
 850 {
 851   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 852     return keywords[token - T_FIRST_KEYWORD];
 853
 854   if (token < 256)
 855     {
 856       static char t[2];
 857       t[0] = token;
 858       return t;
 859     }
 860
 861   return _("<ERROR>");
 862 }
 863
 864 /* Returns an ASCII representation of the current token as a
 865    malloc()'d string. */
 866 char *
 867 lex_token_representation (void)
 868 {
 869   char *token_rep;
 870
 871   switch (token)
 872     {
 873     case T_ID:
 874     case T_POS_NUM:
 875     case T_NEG_NUM:
 876       return ds_xstrdup (&tokstr);
 877       break;
 878
 879     case T_STRING:
 880       {
 881         int hexstring = 0;
 882         char *sp, *dp;
 883
 884         for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 885           if (!isprint ((unsigned char) *sp))
 886             {
 887               hexstring = 1;
 888               break;
 889             }
 890
 891         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 892
 893         dp = token_rep;
 894         if (hexstring)
 895           *dp++ = 'X';
 896         *dp++ = '\'';
 897
 898         if (!hexstring)
 899           for (sp = ds_cstr (&tokstr); *sp; )
 900             {
 901               if (*sp == '\'')
 902                 *dp++ = '\'';
 903               *dp++ = (unsigned char) *sp++;
 904             }
 905         else
 906           for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 907             {
 908               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 909               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 910             }
 911         *dp++ = '\'';
 912         *dp = '\0';
 913
 914         return token_rep;
 915       }
 916     break;
 917
 918     case T_STOP:
 919       token_rep = xmalloc (1);
 920       *token_rep = '\0';
 921       return token_rep;
 922
 923     case T_EXP:
 924       return xstrdup ("**");
 925
 926     default:
 927       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 928         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 929       else
 930         {
 931           token_rep = xmalloc (2);
 932           token_rep[0] = token;
 933           token_rep[1] = '\0';
 934           return token_rep;
 935         }
 936     }
 937
 938   NOT_REACHED ();
 939 }
 940 \f
 941 /* Really weird functions. */
 942
 943 /* Most of the time, a `-' is a lead-in to a negative number.  But
 944    sometimes it's actually part of the syntax.  If a dash can be part
 945    of syntax then this function is called to rip it off of a
 946    number. */
 947 void
 948 lex_negative_to_dash (void)
 949 {
 950   if (token == T_NEG_NUM)
 951     {
 952       token = T_POS_NUM;
 953       tokval = -tokval;
 954       ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
 955       save_token ();
 956       token = '-';
 957     }
 958 }
 959
 960 /* We're not at eof any more. */
 961 void
 962 lex_reset_eof (void)
 963 {
 964   eof = 0;
 965 }
 966
 967 /* Skip a COMMENT command. */
 968 void
 969 lex_skip_comment (void)
 970 {
 971   for (;;)
 972     {
 973       if (!lex_get_line ())
 974         {
 975           put_token = T_STOP;
 976           eof = 1;
 977           return;
 978         }
 979
 980       if (put_token == '.')
 981         break;
 982
 983       prog = ds_end (&getl_buf);
 984       if (dot)
 985         break;
 986     }
 987 }
 988 \f
 989 /* Private functions. */
 990
 991 /* When invoked, tokstr contains a string of binary, octal, or
 992    hex digits, according to TYPE.  The string is converted to
 993    characters having the specified values. */
 994 static void
 995 convert_numeric_string_to_char_string (enum string_type type)
 996 {
 997   const char *base_name;
 998   int base;
 999   int chars_per_byte;
1000   size_t byte_cnt;
1001   size_t i;
1002   char *p;
1003
1004   switch (type)
1005     {
1006     case BINARY_STRING:
1007       base_name = _("binary");
1008       base = 2;
1009       chars_per_byte = 8;
1010       break;
1011     case OCTAL_STRING:
1012       base_name = _("octal");
1013       base = 8;
1014       chars_per_byte = 3;
1015       break;
1016     case HEX_STRING:
1017       base_name = _("hex");
1018       base = 16;
1019       chars_per_byte = 2;
1020       break;
1021     default:
1022       NOT_REACHED ();
1023     }
1024
1025   byte_cnt = ds_length (&tokstr) / chars_per_byte;
1026   if (ds_length (&tokstr) % chars_per_byte)
1027     msg (SE, _("String of %s digits has %d characters, which is not a "
1028                "multiple of %d."),
1029          base_name, ds_length (&tokstr), chars_per_byte);
1030
1031   p = ds_cstr (&tokstr);
1032   for (i = 0; i < byte_cnt; i++)
1033     {
1034       int value;
1035       int j;
1036
1037       value = 0;
1038       for (j = 0; j < chars_per_byte; j++, p++)
1039         {
1040           int v;
1041
1042           if (*p >= '0' && *p <= '9')
1043             v = *p - '0';
1044           else
1045             {
1046               static const char alpha[] = "abcdef";
1047               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1048
1049               if (q)
1050                 v = q - alpha + 10;
1051               else
1052                 v = base;
1053             }
1054
1055           if (v >= base)
1056             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1057
1058           value = value * base + v;
1059         }
1060
1061       ds_cstr (&tokstr)[i] = (unsigned char) value;
1062     }
1063
1064   ds_truncate (&tokstr, byte_cnt);
1065 }
1066
1067 /* Parses a string from the input buffer into tokstr.  The input
1068    buffer pointer prog must point to the initial single or double
1069    quote.  TYPE indicates the type of string to be parsed.
1070    Returns token type. */
1071 static int
1072 parse_string (enum string_type type)
1073 {
1074   /* Accumulate the entire string, joining sections indicated by +
1075      signs. */
1076   for (;;)
1077     {
1078       /* Single or double quote. */
1079       int c = *prog++;
1080
1081       /* Accumulate section. */
1082       for (;;)
1083         {
1084           /* Check end of line. */
1085           if (*prog == '\0')
1086             {
1087               msg (SE, _("Unterminated string constant."));
1088               goto finish;
1089             }
1090
1091           /* Double quote characters to embed them in strings. */
1092           if (*prog == c)
1093             {
1094               if (prog[1] == c)
1095                 prog++;
1096               else
1097                 break;
1098             }
1099
1100           ds_put_char (&tokstr, *prog++);
1101         }
1102       prog++;
1103
1104       /* Skip whitespace after final quote mark. */
1105       if (eof)
1106         break;
1107       for (;;)
1108         {
1109           while (isspace ((unsigned char) *prog))
1110             prog++;
1111           if (*prog)
1112             break;
1113
1114           if (dot)
1115             goto finish;
1116
1117           if (!lex_get_line ())
1118             goto finish;
1119         }
1120
1121       /* Skip plus sign. */
1122       if (*prog != '+')
1123         break;
1124       prog++;
1125
1126       /* Skip whitespace after plus sign. */
1127       if (eof)
1128         break;
1129       for (;;)
1130         {
1131           while (isspace ((unsigned char) *prog))
1132             prog++;
1133           if (*prog)
1134             break;
1135
1136           if (dot)
1137             goto finish;
1138
1139           if (!lex_get_line ())
1140             {
1141               msg (SE, _("Unexpected end of file in string concatenation."));
1142               goto finish;
1143             }
1144         }
1145
1146       /* Ensure that a valid string follows. */
1147       if (*prog != '\'' && *prog != '"')
1148         {
1149           msg (SE, _("String expected following `+'."));
1150           goto finish;
1151         }
1152     }
1153
1154   /* We come here when we've finished concatenating all the string sections
1155      into one large string. */
1156 finish:
1157   if (type != CHARACTER_STRING)
1158     convert_numeric_string_to_char_string (type);
1159
1160   if (ds_length (&tokstr) > 255)
1161     {
1162       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1163            ds_length (&tokstr));
1164       ds_truncate (&tokstr, 255);
1165     }
1166
1167   {
1168     /* FIXME. */
1169     size_t i;
1170     int warned = 0;
1171
1172     for (i = 0; i < ds_length (&tokstr); i++)
1173       if (ds_cstr (&tokstr)[i] == 0)
1174         {
1175           if (!warned)
1176             {
1177               msg (SE, _("Sorry, literal strings may not contain null "
1178                          "characters.  Replacing with spaces."));
1179               warned = 1;
1180             }
1181           ds_cstr (&tokstr)[i] = ' ';
1182         }
1183   }
1184
1185   return T_STRING;
1186 }
1187 \f
1188 #if DUMP_TOKENS
1189 /* Reads one token from the lexer and writes a textual representation
1190    on stdout for debugging purposes. */
1191 static void
1192 dump_token (void)
1193 {
1194   {
1195     const char *curfn;
1196     int curln;
1197
1198     getl_location (&curfn, &curln);
1199     if (curfn)
1200       fprintf (stderr, "%s:%d\t", curfn, curln);
1201   }
1202
1203   switch (token)
1204     {
1205     case T_ID:
1206       fprintf (stderr, "ID\t%s\n", tokid);
1207       break;
1208
1209     case T_POS_NUM:
1210     case T_NEG_NUM:
1211       fprintf (stderr, "NUM\t%f\n", tokval);
1212       break;
1213
1214     case T_STRING:
1215       fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1216       break;
1217
1218     case T_STOP:
1219       fprintf (stderr, "STOP\n");
1220       break;
1221
1222     case T_EXP:
1223       fprintf (stderr, "MISC\tEXP\"");
1224       break;
1225
1226     case 0:
1227       fprintf (stderr, "MISC\tEOF\n");
1228       break;
1229
1230     default:
1231       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1232         fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1233       else
1234         fprintf (stderr, "PUNCT\t%c\n", token);
1235       break;
1236     }
1237 }
1238 #endif /* DUMP_TOKENS */