src/language/lexer/lexer.c

   1 /* PSPP - computes sample statistics.
   2    Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
   3    Written by Ben Pfaff <blp@gnu.org>.
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18    02110-1301, USA. */
  19
  20 #include <config.h>
  21 #include "lexer.h"
  22 #include <libpspp/message.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <libpspp/alloc.h>
  30 #include <libpspp/assertion.h>
  31 #include <language/command.h>
  32 #include <libpspp/message.h>
  33 #include <language/line-buffer.h>
  34 #include <libpspp/magic.h>
  35 #include <data/settings.h>
  36 #include <libpspp/str.h>
  37
  38 #include "size_max.h"
  39
  40 #include "gettext.h"
  41 #define _(msgid) gettext (msgid)
  42 #define N_(msgid) msgid
  43
  44 /*
  45 #define DUMP_TOKENS 1
  46 */
  47
  48 \f
  49 /* Current token: a T_* code, or the character itself for single-character punctuation. */
  50 int token;
  51
  52 /* T_POS_NUM, T_NEG_NUM: the token's numeric value. */
  53 double tokval;
  54
  55 /* T_ID: the identifier, truncated if necessary to fit this buffer. */
  56 char tokid[LONG_NAME_LEN + 1];
  57
  58 /* T_ID, T_STRING: token string value.
  59    For T_ID, this holds the full identifier text, whereas tokid may be truncated. */
  60 struct string tokstr;
  61 \f
  62 /* Static variables. */
  63
  64 /* Pointer to the next unscanned character in line_buffer. */
  65 static char *prog;
  66
  67 /* True if the current line ended a command (terminal dot, or empty line when get_nulline() is set) and lex_get() has not yet returned the corresponding '.' token. */
  68 static bool dot;
  69
  70 /* True once reading a line has failed; lex_get() then returns T_STOP until lex_reset_eof() is called. */
  71 static bool eof;
  72
  73 /* If put_token is nonzero, it is the next token returned by lex_get(),
  74    with put_tokstr and put_tokval as its string and numeric values.  Used only in exceptional circumstances. */
  75 static int put_token;
  76 static struct string put_tokstr;
  77 static double put_tokval;
  78
  79 static int parse_id (void);
  80
  81 /* How the text between a string literal's quotes encodes its contents. */
  82 enum string_type
  83   {
  84     CHARACTER_STRING,   /* Characters. */
  85     BINARY_STRING,      /* Binary digits. */
  86     OCTAL_STRING,       /* Octal digits. */
  87     HEX_STRING          /* Hexadecimal digits. */
  88   };
  89
  90 static int parse_string (enum string_type);
  91
  92 #if DUMP_TOKENS
  93 static void dump_token (void);
  94 #endif
  95 \f
  96 /* Initialization. */
  97
  98 static struct string line_buffer;
  99
 100 static bool (*lex_read_line) (struct string *, bool *);
 101
 102 /* Initializes the lexer. */
 103 void
 104 lex_init (bool (*read_line_func) (struct string *, bool *))
 105 {
 106   ds_init_empty (&tokstr);
 107   ds_init_empty (&put_tokstr);
 108   ds_init_empty (&line_buffer);
 109   lex_read_line = read_line_func;
 110
 111   if (!lex_get_line ())
 112     eof = true;
 113 }
 114
 115 void
 116 lex_done (void)
 117 {
 118   ds_destroy (&put_tokstr);
 119   ds_destroy (&tokstr);
 120   ds_destroy (&line_buffer);
 121 }
 122
 123 \f
 124 /* Common functions. */
 125
 126 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
 127    tokval, respectively, and sets tokid appropriately. */
 128 static void
 129 restore_token (void)
 130 {
 131   assert (put_token != 0);
 132   token = put_token;
 133   ds_assign_string (&tokstr, &put_tokstr);
 134   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 135   tokval = put_tokval;
 136   put_token = 0;
 137 }
 138
 139 /* Copies token, tokstr, tokval into put_token, put_tokstr,
 140    put_tokval respectively. */
 141 static void
 142 save_token (void)
 143 {
 144   put_token = token;
 145   ds_assign_string (&put_tokstr, &tokstr);
 146   put_tokval = tokval;
 147 }
 148
 149 /* Parses a single token, setting appropriate global variables to
 150    indicate the token's attributes. */
 151 void
 152 lex_get (void)
 153 {
 154   /* If a token was pushed ahead, return it. */
 155   if (put_token)
 156     {
 157       restore_token ();
 158 #if DUMP_TOKENS
 159       dump_token ();
 160 #endif
 161       return;
 162     }
 163
 164   /* Find a token. */
 165   for (;;)
 166     {
 167       /* Skip whitespace. */
 168       if (eof)
 169         {
 170           token = T_STOP;
 171           return;
 172         }
 173
 174       for (;;)
 175         {
 176           while (isspace ((unsigned char) *prog))
 177             prog++;
 178           if (*prog)
 179             break;
 180
 181           if (dot)
 182             {
 183               dot = 0;
 184               token = '.';
 185 #if DUMP_TOKENS
 186               dump_token ();
 187 #endif
 188               return;
 189             }
 190           else if (!lex_get_line ())
 191             {
 192               eof = true;
 193               token = T_STOP;
 194 #if DUMP_TOKENS
 195               dump_token ();
 196 #endif
 197               return;
 198             }
 199
 200           if (put_token)
 201             {
 202               restore_token ();
 203 #if DUMP_TOKENS
 204               dump_token ();
 205 #endif
 206               return;
 207             }
 208         }
 209
 210
 211       /* Actually parse the token. */
 212       ds_clear (&tokstr);
 213
 214       switch (*prog)
 215         {
 216         case '-': case '.':
 217         case '0': case '1': case '2': case '3': case '4':
 218         case '5': case '6': case '7': case '8': case '9':
 219           {
 220             char *tail;
 221
 222             /* `-' can introduce a negative number, or it can be a
 223                token by itself.  If it is not followed by a digit or a
 224                decimal point, it is definitely not a number.
 225                Otherwise, it might be either, but most of the time we
 226                want it as a number.  When the syntax calls for a `-'
 227                token, lex_negative_to_dash() must be used to break
 228                negative numbers into two tokens. */
 229             if (*prog == '-')
 230               {
 231                 ds_put_char (&tokstr, *prog++);
 232                 while (isspace ((unsigned char) *prog))
 233                   prog++;
 234
 235                 if (!isdigit ((unsigned char) *prog) && *prog != '.')
 236                   {
 237                     token = '-';
 238                     break;
 239                   }
 240                 token = T_NEG_NUM;
 241               }
 242             else
 243               token = T_POS_NUM;
 244
 245             /* Parse the number, copying it into tokstr. */
 246             while (isdigit ((unsigned char) *prog))
 247               ds_put_char (&tokstr, *prog++);
 248             if (*prog == '.')
 249               {
 250                 ds_put_char (&tokstr, *prog++);
 251                 while (isdigit ((unsigned char) *prog))
 252                   ds_put_char (&tokstr, *prog++);
 253               }
 254             if (*prog == 'e' || *prog == 'E')
 255               {
 256                 ds_put_char (&tokstr, *prog++);
 257                 if (*prog == '+' || *prog == '-')
 258                   ds_put_char (&tokstr, *prog++);
 259                 while (isdigit ((unsigned char) *prog))
 260                   ds_put_char (&tokstr, *prog++);
 261               }
 262
 263             /* Parse as floating point. */
 264             tokval = strtod (ds_cstr (&tokstr), &tail);
 265             if (*tail)
 266               {
 267                 msg (SE, _("%s does not form a valid number."),
 268                      ds_cstr (&tokstr));
 269                 tokval = 0.0;
 270
 271                 ds_clear (&tokstr);
 272                 ds_put_char (&tokstr, '0');
 273               }
 274
 275             break;
 276           }
 277
 278         case '\'': case '"':
 279           token = parse_string (CHARACTER_STRING);
 280           break;
 281
 282         case '(': case ')': case ',': case '=': case '+': case '/':
 283           token = *prog++;
 284           break;
 285
 286         case '*':
 287           if (*++prog == '*')
 288             {
 289               prog++;
 290               token = T_EXP;
 291             }
 292           else
 293             token = '*';
 294           break;
 295
 296         case '<':
 297           if (*++prog == '=')
 298             {
 299               prog++;
 300               token = T_LE;
 301             }
 302           else if (*prog == '>')
 303             {
 304               prog++;
 305               token = T_NE;
 306             }
 307           else
 308             token = T_LT;
 309           break;
 310
 311         case '>':
 312           if (*++prog == '=')
 313             {
 314               prog++;
 315               token = T_GE;
 316             }
 317           else
 318             token = T_GT;
 319           break;
 320
 321         case '~':
 322           if (*++prog == '=')
 323             {
 324               prog++;
 325               token = T_NE;
 326             }
 327           else
 328             token = T_NOT;
 329           break;
 330
 331         case '&':
 332           prog++;
 333           token = T_AND;
 334           break;
 335
 336         case '|':
 337           prog++;
 338           token = T_OR;
 339           break;
 340
 341         case 'b': case 'B':
 342           if (prog[1] == '\'' || prog[1] == '"')
 343             token = parse_string (BINARY_STRING);
 344           else
 345             token = parse_id ();
 346           break;
 347
 348         case 'o': case 'O':
 349           if (prog[1] == '\'' || prog[1] == '"')
 350             token = parse_string (OCTAL_STRING);
 351           else
 352             token = parse_id ();
 353           break;
 354
 355         case 'x': case 'X':
 356           if (prog[1] == '\'' || prog[1] == '"')
 357             token = parse_string (HEX_STRING);
 358           else
 359             token = parse_id ();
 360           break;
 361
 362         default:
 363           if (lex_is_id1 (*prog))
 364             {
 365               token = parse_id ();
 366               break;
 367             }
 368           else
 369             {
 370               if (isgraph ((unsigned char) *prog))
 371                 msg (SE, _("Bad character in input: `%c'."), *prog++);
 372               else
 373                 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
 374               continue;
 375             }
 376         }
 377       break;
 378     }
 379
 380 #if DUMP_TOKENS
 381   dump_token ();
 382 #endif
 383 }
 384
 385 /* Parses an identifier at the current position into tokid and
 386    tokstr.
 387    Returns the correct token type. */
 388 static int
 389 parse_id (void)
 390 {
 391   const char *start = prog;
 392   prog = lex_skip_identifier (start);
 393
 394   ds_put_substring (&tokstr, ss_buffer (start, prog - start));
 395   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 396   return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
 397 }
 398
 399 /* Reports an error to the effect that subcommand SBC may only be
 400    specified once. */
 401 void
 402 lex_sbc_only_once (const char *sbc)
 403 {
 404   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 405 }
 406
 407 /* Reports an error to the effect that subcommand SBC is
 408    missing. */
 409 void
 410 lex_sbc_missing (const char *sbc)
 411 {
 412   lex_error (_("missing required subcommand %s"), sbc);
 413 }
 414
 415 /* Prints a syntax error message containing the current token and
 416    given message MESSAGE (if non-null). */
 417 void
 418 lex_error (const char *message, ...)
 419 {
 420   char *token_rep;
 421   char where[128];
 422
 423   token_rep = lex_token_representation ();
 424   if (token == T_STOP)
 425     strcpy (where, "end of file");
 426   else if (token == '.')
 427     strcpy (where, "end of command");
 428   else
 429     snprintf (where, sizeof where, "`%s'", token_rep);
 430   free (token_rep);
 431
 432   if (message)
 433     {
 434       char buf[1024];
 435       va_list args;
 436
 437       va_start (args, message);
 438       vsnprintf (buf, 1024, message, args);
 439       va_end (args);
 440
 441       msg (SE, _("Syntax error %s at %s."), buf, where);
 442     }
 443   else
 444     msg (SE, _("Syntax error at %s."), where);
 445 }
 446
 447 /* Checks that we're at end of command.
 448    If so, returns a successful command completion code.
 449    If not, flags a syntax error and returns an error command
 450    completion code. */
 451 int
 452 lex_end_of_command (void)
 453 {
 454   if (token != '.')
 455     {
 456       lex_error (_("expecting end of command"));
 457       return CMD_FAILURE;
 458     }
 459   else
 460     return CMD_SUCCESS;
 461 }
 462 \f
 463 /* Token testing functions. */
 464
 465 /* Returns true if the current token is a number. */
 466 bool
 467 lex_is_number (void)
 468 {
 469   return token == T_POS_NUM || token == T_NEG_NUM;
 470 }
 471
 472 /* Returns the value of the current token, which must be a
 473    floating point number. */
 474 double
 475 lex_number (void)
 476 {
 477   assert (lex_is_number ());
 478   return tokval;
 479 }
 480
 481 /* Returns true iff the current token is an integer. */
 482 bool
 483 lex_is_integer (void)
 484 {
 485   return (lex_is_number ()
 486           && tokval != NOT_LONG
 487           && tokval >= LONG_MIN
 488           && tokval <= LONG_MAX
 489           && floor (tokval) == tokval);
 490 }
 491
 492 /* Returns the value of the current token, which must be an
 493    integer. */
 494 long
 495 lex_integer (void)
 496 {
 497   assert (lex_is_integer ());
 498   return tokval;
 499 }
 500 \f
 501 /* Token matching functions. */
 502
 503 /* If TOK is the current token, skips it and returns true
 504    Otherwise, returns false. */
 505 bool
 506 lex_match (int t)
 507 {
 508   if (token == t)
 509     {
 510       lex_get ();
 511       return true;
 512     }
 513   else
 514     return false;
 515 }
 516
 517 /* If the current token is the identifier S, skips it and returns
 518    true.  The identifier may be abbreviated to its first three
 519    letters.
 520    Otherwise, returns false. */
 521 bool
 522 lex_match_id (const char *s)
 523 {
 524   if (token == T_ID && lex_id_match (s, tokid))
 525     {
 526       lex_get ();
 527       return true;
 528     }
 529   else
 530     return false;
 531 }
 532
 533 /* If the current token is integer N, skips it and returns true.
 534    Otherwise, returns false. */
 535 bool
 536 lex_match_int (int x)
 537 {
 538   if (lex_is_integer () && lex_integer () == x)
 539     {
 540       lex_get ();
 541       return true;
 542     }
 543   else
 544     return false;
 545 }
 546 \f
 547 /* Forced matches. */
 548
 549 /* If this token is identifier S, fetches the next token and returns
 550    nonzero.
 551    Otherwise, reports an error and returns zero. */
 552 bool
 553 lex_force_match_id (const char *s)
 554 {
 555   if (token == T_ID && lex_id_match (s, tokid))
 556     {
 557       lex_get ();
 558       return true;
 559     }
 560   else
 561     {
 562       lex_error (_("expecting `%s'"), s);
 563       return false;
 564     }
 565 }
 566
 567 /* If the current token is T, skips the token.  Otherwise, reports an
 568    error and returns from the current function with return value false. */
 569 bool
 570 lex_force_match (int t)
 571 {
 572   if (token == t)
 573     {
 574       lex_get ();
 575       return true;
 576     }
 577   else
 578     {
 579       lex_error (_("expecting `%s'"), lex_token_name (t));
 580       return false;
 581     }
 582 }
 583
 584 /* If this token is a string, does nothing and returns true.
 585    Otherwise, reports an error and returns false. */
 586 bool
 587 lex_force_string (void)
 588 {
 589   if (token == T_STRING)
 590     return true;
 591   else
 592     {
 593       lex_error (_("expecting string"));
 594       return false;
 595     }
 596 }
 597
 598 /* If this token is an integer, does nothing and returns true.
 599    Otherwise, reports an error and returns false. */
 600 bool
 601 lex_force_int (void)
 602 {
 603   if (lex_is_integer ())
 604     return true;
 605   else
 606     {
 607       lex_error (_("expecting integer"));
 608       return false;
 609     }
 610 }
 611
 612 /* If this token is a number, does nothing and returns true.
 613    Otherwise, reports an error and returns false. */
 614 bool
 615 lex_force_num (void)
 616 {
 617   if (lex_is_number ())
 618     return true;
 619   else
 620     {
 621       lex_error (_("expecting number"));
 622       return false;
 623     }
 624 }
 625
 626 /* If this token is an identifier, does nothing and returns true.
 627    Otherwise, reports an error and returns false. */
 628 bool
 629 lex_force_id (void)
 630 {
 631   if (token == T_ID)
 632     return true;
 633   else
 634     {
 635       lex_error (_("expecting identifier"));
 636       return false;
 637     }
 638 }
 639 /* Weird token functions. */
 640
 641 /* Returns the first character of the next token, except that if the
 642    next token is not an identifier, the character returned will not be
 643    a character that can begin an identifier.  Specifically, the
 644    hexstring lead-in X' causes lookahead() to return '.  Note that an
 645    alphanumeric return value doesn't guarantee an ID token, it could
 646    also be a reserved-word token. */
 647 int
 648 lex_look_ahead (void)
 649 {
 650   if (put_token)
 651     return put_token;
 652
 653   for (;;)
 654     {
 655       if (eof)
 656         return 0;
 657
 658       for (;;)
 659         {
 660           while (isspace ((unsigned char) *prog))
 661             prog++;
 662           if (*prog)
 663             break;
 664
 665           if (dot)
 666             return '.';
 667           else if (!lex_get_line ())
 668             return 0;
 669
 670           if (put_token)
 671             return put_token;
 672         }
 673
 674       if ((toupper ((unsigned char) *prog) == 'X'
 675            || toupper ((unsigned char) *prog) == 'B'
 676            || toupper ((unsigned char) *prog) == 'O')
 677           && (prog[1] == '\'' || prog[1] == '"'))
 678         return '\'';
 679
 680       return *prog;
 681     }
 682 }
 683
 684 /* Makes the current token become the next token to be read; the
 685    current token is set to T. */
 686 void
 687 lex_put_back (int t)
 688 {
 689   save_token ();
 690   token = t;
 691 }
 692
 693 /* Makes the current token become the next token to be read; the
 694    current token is set to the identifier ID. */
 695 void
 696 lex_put_back_id (const char *id)
 697 {
 698   assert (lex_id_to_token (id, strlen (id)) == T_ID);
 699   save_token ();
 700   token = T_ID;
 701   ds_assign_cstr (&tokstr, id);
 702   str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
 703 }
 704 \f
 705 /* Weird line processing functions. */
 706
 707 /* Returns the entire contents of the current line. */
 708 const char *
 709 lex_entire_line (void)
 710 {
 711   return ds_cstr (&line_buffer);
 712 }
 713
 714 const struct string *
 715 lex_entire_line_ds (void)
 716 {
 717   return &line_buffer;
 718 }
 719
 720 /* As lex_entire_line(), but only returns the part of the current line
 721    that hasn't already been tokenized.
 722    If END_DOT is non-null, stores nonzero into *END_DOT if the line
 723    ends with a terminal dot, or zero if it doesn't. */
 724 const char *
 725 lex_rest_of_line (int *end_dot)
 726 {
 727   if (end_dot)
 728     *end_dot = dot;
 729   return prog;
 730 }
 731
 732 /* Causes the rest of the current input line to be ignored for
 733    tokenization purposes. */
 734 void
 735 lex_discard_line (void)
 736 {
 737   ds_cstr (&line_buffer);  /* Ensures ds_end points to something valid */
 738   prog = ds_end (&line_buffer);
 739   dot = false;
 740   put_token = 0;
 741 }
 742
 743
 744 /* Discards the rest of the current command.
 745    When we're reading commands from a file, we skip tokens until
 746    a terminal dot or EOF.
 747    When we're reading commands interactively from the user,
 748    that's just discarding the current line, because presumably
 749    the user doesn't want to finish typing a command that will be
 750    ignored anyway. */
 751 void
 752 lex_discard_rest_of_command (void)
 753 {
 754   if (!getl_is_interactive ())
 755     {
 756       while (token != T_STOP && token != '.')
 757         lex_get ();
 758     }
 759   else
 760     lex_discard_line ();
 761 }
 762 \f
 763 /* Weird line reading functions. */
 764
 765 /* Remove C-style comments in STRING, begun by slash-star and
 766    terminated by star-slash or newline. */
 767 static void
 768 strip_comments (struct string *string)
 769 {
 770   char *cp;
 771   int quote;
 772   bool in_comment;
 773
 774   in_comment = false;
 775   quote = EOF;
 776   for (cp = ds_cstr (string); *cp; )
 777     {
 778       /* If we're not in a comment, check for quote marks. */
 779       if (!in_comment)
 780         {
 781           if (*cp == quote)
 782             quote = EOF;
 783           else if (*cp == '\'' || *cp == '"')
 784             quote = *cp;
 785         }
 786
 787       /* If we're not inside a quotation, check for comment. */
 788       if (quote == EOF)
 789         {
 790           if (cp[0] == '/' && cp[1] == '*')
 791             {
 792               in_comment = true;
 793               *cp++ = ' ';
 794               *cp++ = ' ';
 795               continue;
 796             }
 797           else if (in_comment && cp[0] == '*' && cp[1] == '/')
 798             {
 799               in_comment = false;
 800               *cp++ = ' ';
 801               *cp++ = ' ';
 802               continue;
 803             }
 804         }
 805
 806       /* Check commenting. */
 807       if (in_comment)
 808         *cp = ' ';
 809       cp++;
 810     }
 811 }
 812
 813 /* Reads a line, without performing any preprocessing */
 814 bool
 815 lex_get_line_raw (void)
 816 {
 817   bool dummy;
 818   return lex_read_line (&line_buffer, &dummy);
 819 }
 820
 821 /* Reads a line for use by the tokenizer, and preprocesses it by
 822    removing comments, stripping trailing whitespace and the
 823    terminal dot, and removing leading indentors. */
 824 bool
 825 lex_get_line (void)
 826 {
 827   struct string *line = &line_buffer;
 828   bool interactive;
 829
 830   if (!lex_read_line (line, &interactive))
 831     return false;
 832
 833   strip_comments (line);
 834   ds_rtrim (line, ss_cstr (CC_SPACES));
 835
 836   /* Check for and remove terminal dot. */
 837   dot = (ds_chomp (line, get_endcmd ())
 838          || (ds_is_empty (line) && get_nulline ()));
 839
 840   /* Strip leading indentors or insert a terminal dot (unless the
 841      line was obtained interactively). */
 842   if (!interactive)
 843     {
 844       int first = ds_first (line);
 845
 846       if (first == '+' || first == '-')
 847         *ds_data (line) = ' ';
 848       else if (first != EOF && !isspace (first))
 849         put_token = '.';
 850     }
 851
 852   prog = ds_cstr (line);
 853
 854   return true;
 855 }
 856 \f
 857 /* Token names. */
 858
 859 /* Returns the name of a token in a static buffer. */
 860 const char *
 861 lex_token_name (int token)
 862 {
 863   if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 864     return keywords[token - T_FIRST_KEYWORD];
 865
 866   if (token < 256)
 867     {
 868       static char t[2];
 869       t[0] = token;
 870       return t;
 871     }
 872
 873   NOT_REACHED ();
 874 }
 875
 876 /* Returns an ASCII representation of the current token as a
 877    malloc()'d string. */
 878 char *
 879 lex_token_representation (void)
 880 {
 881   char *token_rep;
 882
 883   switch (token)
 884     {
 885     case T_ID:
 886     case T_POS_NUM:
 887     case T_NEG_NUM:
 888       return ds_xstrdup (&tokstr);
 889       break;
 890
 891     case T_STRING:
 892       {
 893         int hexstring = 0;
 894         char *sp, *dp;
 895
 896         for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 897           if (!isprint ((unsigned char) *sp))
 898             {
 899               hexstring = 1;
 900               break;
 901             }
 902
 903         token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
 904
 905         dp = token_rep;
 906         if (hexstring)
 907           *dp++ = 'X';
 908         *dp++ = '\'';
 909
 910         if (!hexstring)
 911           for (sp = ds_cstr (&tokstr); *sp; )
 912             {
 913               if (*sp == '\'')
 914                 *dp++ = '\'';
 915               *dp++ = (unsigned char) *sp++;
 916             }
 917         else
 918           for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
 919             {
 920               *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
 921               *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
 922             }
 923         *dp++ = '\'';
 924         *dp = '\0';
 925
 926         return token_rep;
 927       }
 928     break;
 929
 930     case T_STOP:
 931       token_rep = xmalloc (1);
 932       *token_rep = '\0';
 933       return token_rep;
 934
 935     case T_EXP:
 936       return xstrdup ("**");
 937
 938     default:
 939       if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
 940         return xstrdup (keywords [token - T_FIRST_KEYWORD]);
 941       else
 942         {
 943           token_rep = xmalloc (2);
 944           token_rep[0] = token;
 945           token_rep[1] = '\0';
 946           return token_rep;
 947         }
 948     }
 949
 950   NOT_REACHED ();
 951 }
 952 \f
 953 /* Really weird functions. */
 954
 955 /* Most of the time, a `-' is a lead-in to a negative number.  But
 956    sometimes it's actually part of the syntax.  If a dash can be part
 957    of syntax then this function is called to rip it off of a
 958    number. */
 959 void
 960 lex_negative_to_dash (void)
 961 {
 962   if (token == T_NEG_NUM)
 963     {
 964       token = T_POS_NUM;
 965       tokval = -tokval;
 966       ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
 967       save_token ();
 968       token = '-';
 969     }
 970 }
 971
 972 /* We're not at eof any more. */
 973 void
 974 lex_reset_eof (void)
 975 {
 976   eof = false;
 977 }
 978
 979 /* Skip a COMMENT command. */
 980 void
 981 lex_skip_comment (void)
 982 {
 983   for (;;)
 984     {
 985       if (!lex_get_line ())
 986         {
 987           put_token = T_STOP;
 988           eof = true;
 989           return;
 990         }
 991
 992       if (put_token == '.')
 993         break;
 994
 995       ds_cstr (&line_buffer); /* Ensures ds_end will point to a valid char */
 996       prog = ds_end (&line_buffer);
 997       if (dot)
 998         break;
 999     }
1000 }
1001 \f
1002 /* Private functions. */
1003
1004 /* When invoked, tokstr contains a string of binary, octal, or
1005    hex digits, according to TYPE.  The string is converted to
1006    characters having the specified values. */
1007 static void
1008 convert_numeric_string_to_char_string (enum string_type type)
1009 {
1010   const char *base_name;
1011   int base;
1012   int chars_per_byte;
1013   size_t byte_cnt;
1014   size_t i;
1015   char *p;
1016
1017   switch (type)
1018     {
1019     case BINARY_STRING:
1020       base_name = _("binary");
1021       base = 2;
1022       chars_per_byte = 8;
1023       break;
1024     case OCTAL_STRING:
1025       base_name = _("octal");
1026       base = 8;
1027       chars_per_byte = 3;
1028       break;
1029     case HEX_STRING:
1030       base_name = _("hex");
1031       base = 16;
1032       chars_per_byte = 2;
1033       break;
1034     default:
1035       NOT_REACHED ();
1036     }
1037
1038   byte_cnt = ds_length (&tokstr) / chars_per_byte;
1039   if (ds_length (&tokstr) % chars_per_byte)
1040     msg (SE, _("String of %s digits has %d characters, which is not a "
1041                "multiple of %d."),
1042          base_name, ds_length (&tokstr), chars_per_byte);
1043
1044   p = ds_cstr (&tokstr);
1045   for (i = 0; i < byte_cnt; i++)
1046     {
1047       int value;
1048       int j;
1049
1050       value = 0;
1051       for (j = 0; j < chars_per_byte; j++, p++)
1052         {
1053           int v;
1054
1055           if (*p >= '0' && *p <= '9')
1056             v = *p - '0';
1057           else
1058             {
1059               static const char alpha[] = "abcdef";
1060               const char *q = strchr (alpha, tolower ((unsigned char) *p));
1061
1062               if (q)
1063                 v = q - alpha + 10;
1064               else
1065                 v = base;
1066             }
1067
1068           if (v >= base)
1069             msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1070
1071           value = value * base + v;
1072         }
1073
1074       ds_cstr (&tokstr)[i] = (unsigned char) value;
1075     }
1076
1077   ds_truncate (&tokstr, byte_cnt);
1078 }
1079
1080 /* Parses a string from the input buffer into tokstr.  The input
1081    buffer pointer prog must point to the initial single or double
1082    quote.  TYPE indicates the type of string to be parsed.
1083    Returns token type. */
1084 static int
1085 parse_string (enum string_type type)
1086 {
1087   if (type != CHARACTER_STRING)
1088     prog++;
1089
1090   /* Accumulate the entire string, joining sections indicated by +
1091      signs. */
1092   for (;;)
1093     {
1094       /* Single or double quote. */
1095       int c = *prog++;
1096
1097       /* Accumulate section. */
1098       for (;;)
1099         {
1100           /* Check end of line. */
1101           if (*prog == '\0')
1102             {
1103               msg (SE, _("Unterminated string constant."));
1104               goto finish;
1105             }
1106
1107           /* Double quote characters to embed them in strings. */
1108           if (*prog == c)
1109             {
1110               if (prog[1] == c)
1111                 prog++;
1112               else
1113                 break;
1114             }
1115
1116           ds_put_char (&tokstr, *prog++);
1117         }
1118       prog++;
1119
1120       /* Skip whitespace after final quote mark. */
1121       if (eof)
1122         break;
1123       for (;;)
1124         {
1125           while (isspace ((unsigned char) *prog))
1126             prog++;
1127           if (*prog)
1128             break;
1129
1130           if (dot)
1131             goto finish;
1132
1133           if (!lex_get_line ())
1134             goto finish;
1135         }
1136
1137       /* Skip plus sign. */
1138       if (*prog != '+')
1139         break;
1140       prog++;
1141
1142       /* Skip whitespace after plus sign. */
1143       if (eof)
1144         break;
1145       for (;;)
1146         {
1147           while (isspace ((unsigned char) *prog))
1148             prog++;
1149           if (*prog)
1150             break;
1151
1152           if (dot)
1153             goto finish;
1154
1155           if (!lex_get_line ())
1156             {
1157               msg (SE, _("Unexpected end of file in string concatenation."));
1158               goto finish;
1159             }
1160         }
1161
1162       /* Ensure that a valid string follows. */
1163       if (*prog != '\'' && *prog != '"')
1164         {
1165           msg (SE, _("String expected following `+'."));
1166           goto finish;
1167         }
1168     }
1169
1170   /* We come here when we've finished concatenating all the string sections
1171      into one large string. */
1172 finish:
1173   if (type != CHARACTER_STRING)
1174     convert_numeric_string_to_char_string (type);
1175
1176   if (ds_length (&tokstr) > 255)
1177     {
1178       msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1179            ds_length (&tokstr));
1180       ds_truncate (&tokstr, 255);
1181     }
1182
1183   return T_STRING;
1184 }
1185 \f
#if DUMP_TOKENS
/* Writes a textual representation of the current token to stderr
   for debugging purposes, one token per line.  The line is
   prefixed by the file name and line number obtained from
   getl_location(), when it reports a file name. */
static void
dump_token (void)
{
  {
    const char *curfn;
    int curln;

    getl_location (&curfn, &curln);
    if (curfn)
      fprintf (stderr, "%s:%d\t", curfn, curln);
  }

  switch (token)
    {
    case T_ID:
      fprintf (stderr, "ID\t%s\n", tokid);
      break;

    case T_POS_NUM:
    case T_NEG_NUM:
      fprintf (stderr, "NUM\t%f\n", tokval);
      break;

    case T_STRING:
      fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
      break;

    case T_STOP:
      fprintf (stderr, "STOP\n");
      break;

    case T_EXP:
      /* End the line with a newline like every other case. */
      fprintf (stderr, "MISC\tEXP\n");
      break;

    case 0:
      fprintf (stderr, "MISC\tEOF\n");
      break;

    default:
      if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
        fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
      else
        fprintf (stderr, "PUNCT\t%c\n", token);
      break;
    }
}
#endif /* DUMP_TOKENS */