pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28 #include "libpspp/i18n.h"
  29
  30 #include "gl/c-ctype.h"
  31 #include "gl/c-strtod.h"
  32 #include "gl/xmemdup0.h"
  33
  34 #include "gettext.h"
  35 #define _(msgid) gettext (msgid)
  36
  37 enum
  38   {
  39     S_START,
  40     S_DASH,
  41     S_STRING
  42   };
  43
  44 #define SS_NL_BEFORE_PLUS (1u << 0)
  45 #define SS_PLUS           (1u << 1)
  46 #define SS_NL_AFTER_PLUS  (1u << 2)
  47
  48 /* Returns the integer value of (hex) digit C. */
  49 static int
  50 digit_value (int c)
  51 {
  52   switch (c)
  53     {
  54     case '0': return 0;
  55     case '1': return 1;
  56     case '2': return 2;
  57     case '3': return 3;
  58     case '4': return 4;
  59     case '5': return 5;
  60     case '6': return 6;
  61     case '7': return 7;
  62     case '8': return 8;
  63     case '9': return 9;
  64     case 'a': case 'A': return 10;
  65     case 'b': case 'B': return 11;
  66     case 'c': case 'C': return 12;
  67     case 'd': case 'D': return 13;
  68     case 'e': case 'E': return 14;
  69     case 'f': case 'F': return 15;
  70     default: return INT_MAX;
  71     }
  72 }
  73
  74 static bool
  75 scan_quoted_string__ (struct substring s, struct token *token)
  76 {
  77   int quote;
  78
  79   /* Trim ' or " from front and back. */
  80   quote = s.string[s.length - 1];
  81   s.string++;
  82   s.length -= 2;
  83
  84   ss_realloc (&token->string, token->string.length + s.length + 1);
  85
  86   for (;;)
  87     {
  88       size_t pos = ss_find_byte (s, quote);
  89       if (pos == SIZE_MAX)
  90         break;
  91
  92       memcpy (ss_end (token->string), s.string, pos + 1);
  93       token->string.length += pos + 1;
  94       ss_advance (&s, pos + 2);
  95     }
  96
  97   memcpy (ss_end (token->string), s.string, ss_length (s));
  98   token->string.length += ss_length (s);
  99
 100   return true;
 101 }
 102
 103 static bool
 104 scan_hex_string__ (struct substring s, struct token *token)
 105 {
 106   uint8_t *dst;
 107   size_t i;
 108
 109   /* Trim X' from front and ' from back. */
 110   s.string += 2;
 111   s.length -= 3;
 112
 113   if (s.length % 2 != 0)
 114     {
 115       token->type = SCAN_BAD_HEX_LENGTH;
 116       token->number = s.length;
 117       return false;
 118     }
 119
 120   ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
 121   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
 122   token->string.length += s.length / 2;
 123   for (i = 0; i < s.length; i += 2)
 124     {
 125       int hi = digit_value (s.string[i]);
 126       int lo = digit_value (s.string[i + 1]);
 127
 128       if (hi >= 16 || lo >= 16)
 129         {
 130           token->type = SCAN_BAD_HEX_DIGIT;
 131           token->number = s.string[hi >= 16 ? i : i + 1];
 132           return false;
 133         }
 134
 135       *dst++ = hi * 16 + lo;
 136     }
 137
 138   return true;
 139 }
 140
 141 static bool
 142 scan_unicode_string__ (struct substring s, struct token *token)
 143 {
 144   uint8_t *dst;
 145   ucs4_t uc;
 146   size_t i;
 147
 148   /* Trim U' from front and ' from back. */
 149   s.string += 2;
 150   s.length -= 3;
 151
 152   if (s.length < 1 || s.length > 8)
 153     {
 154       token->type = SCAN_BAD_UNICODE_LENGTH;
 155       token->number = s.length;
 156       return 0;
 157     }
 158
 159   ss_realloc (&token->string, token->string.length + 4 + 1);
 160
 161   uc = 0;
 162   for (i = 0; i < s.length; i++)
 163     {
 164       int digit = digit_value (s.string[i]);
 165       if (digit >= 16)
 166         {
 167           token->type = SCAN_BAD_UNICODE_DIGIT;
 168           token->number = s.string[i];
 169           return 0;
 170         }
 171       uc = uc * 16 + digit;
 172     }
 173
 174   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 175     {
 176       token->type = SCAN_BAD_UNICODE_CODE_POINT;
 177       token->number = uc;
 178       return 0;
 179     }
 180
 181   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
 182   token->string.length += u8_uctomb (dst, uc, 4);
 183
 184   return true;
 185 }
 186
 187 static enum scan_result
 188 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
 189                        struct substring s, struct token *token)
 190 {
 191   bool ok;
 192
 193   switch (type)
 194     {
 195     case SEG_QUOTED_STRING:
 196       ok = scan_quoted_string__ (s, token);
 197       break;
 198
 199     case SEG_HEX_STRING:
 200       ok = scan_hex_string__ (s, token);
 201       break;
 202
 203     case SEG_UNICODE_STRING:
 204       ok = scan_unicode_string__ (s, token);
 205       break;
 206
 207     default:
 208       NOT_REACHED ();
 209     }
 210
 211   if (ok)
 212     {
 213       token->type = T_STRING;
 214       token->string.string[token->string.length] = '\0';
 215       scanner->state = S_STRING;
 216       scanner->substate = 0;
 217       return SCAN_SAVE;
 218     }
 219   else
 220     {
 221       /* The function we called above should have filled in token->type and
 222          token->number properly to describe the error. */
 223       ss_dealloc (&token->string);
 224       token->string = ss_empty ();
 225       return SCAN_DONE;
 226     }
 227
 228 }
 229
 230 static enum scan_result
 231 add_bit (struct scanner *scanner, unsigned int bit)
 232 {
 233   if (!(scanner->substate & bit))
 234     {
 235       scanner->substate |= bit;
 236       return SCAN_MORE;
 237     }
 238   else
 239     return SCAN_BACK;
 240 }
 241
 242 static enum scan_result
 243 scan_string__ (struct scanner *scanner, enum segment_type type,
 244                struct substring s, struct token *token)
 245 {
 246   switch (type)
 247     {
 248     case SEG_SPACES:
 249     case SEG_COMMENT:
 250       return SCAN_MORE;
 251
 252     case SEG_NEWLINE:
 253       if (scanner->substate & SS_PLUS)
 254         return add_bit (scanner, SS_NL_AFTER_PLUS);
 255       else
 256         return add_bit (scanner, SS_NL_BEFORE_PLUS);
 257
 258     case SEG_PUNCT:
 259       return (s.length == 1 && s.string[0] == '+'
 260               ? add_bit (scanner, SS_PLUS)
 261               : SCAN_BACK);
 262
 263     case SEG_QUOTED_STRING:
 264     case SEG_HEX_STRING:
 265     case SEG_UNICODE_STRING:
 266       return (scanner->substate & SS_PLUS
 267               ? scan_string_segment__ (scanner, type, s, token)
 268               : SCAN_BACK);
 269
 270     default:
 271       return SCAN_BACK;
 272     }
 273 }
 274
 275 static enum token_type
 276 scan_reserved_word__ (struct substring word)
 277 {
 278   switch (c_toupper (word.string[0]))
 279     {
 280     case 'B':
 281       return T_BY;
 282
 283     case 'E':
 284       return T_EQ;
 285
 286     case 'G':
 287       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 288
 289     case 'L':
 290       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 291
 292     case 'N':
 293       return word.length == 2 ? T_NE : T_NOT;
 294
 295     case 'O':
 296       return T_OR;
 297
 298     case 'T':
 299       return T_TO;
 300
 301     case 'A':
 302       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 303
 304     case 'W':
 305       return T_WITH;
 306     }
 307
 308   NOT_REACHED ();
 309 }
 310
 311 static enum token_type
 312 scan_punct1__ (char c0)
 313 {
 314   switch (c0)
 315     {
 316     case '(': return T_LPAREN;
 317     case ')': return T_RPAREN;
 318     case ',': return T_COMMA;
 319     case '=': return T_EQUALS;
 320     case '-': return T_DASH;
 321     case '[': return T_LBRACK;
 322     case ']': return T_RBRACK;
 323     case '&': return T_AND;
 324     case '|': return T_OR;
 325     case '+': return T_PLUS;
 326     case '/': return T_SLASH;
 327     case '*': return T_ASTERISK;
 328     case '<': return T_LT;
 329     case '>': return T_GT;
 330     case '~': return T_NOT;
 331     default: return T_MACRO_PUNCT;
 332     }
 333
 334   NOT_REACHED ();
 335 }
 336
 337 static enum token_type
 338 scan_punct2__ (char c0, char c1)
 339 {
 340   switch (c0)
 341     {
 342     case '*':
 343       return T_EXP;
 344
 345     case '<':
 346       return c1 == '=' ? T_LE : T_NE;
 347
 348     case '>':
 349       return T_GE;
 350
 351     case '~':
 352       return T_NE;
 353
 354     case '&':
 355       return T_AND;
 356
 357     case '|':
 358       return T_OR;
 359     }
 360
 361   NOT_REACHED ();
 362 }
 363
 364 static enum token_type
 365 scan_punct__ (struct substring s)
 366 {
 367   return (s.length == 1
 368           ? scan_punct1__ (s.string[0])
 369           : scan_punct2__ (s.string[0], s.string[1]));
 370 }
 371
 372 static void
 373 scan_number__ (struct substring s, struct token *token)
 374 {
 375   char buf[128];
 376   char *p;
 377
 378   if (s.length < sizeof buf)
 379     {
 380       p = buf;
 381       memcpy (buf, s.string, s.length);
 382       buf[s.length] = '\0';
 383     }
 384   else
 385     p = xmemdup0 (s.string, s.length);
 386
 387   bool negative = *p == '-';
 388   double x = c_strtod (p + negative, NULL);
 389   *token = (struct token) {
 390     .type = negative ? T_NEG_NUM : T_POS_NUM,
 391     .number = negative ? -x : x,
 392   };
 393
 394   if (p != buf)
 395     free (p);
 396 }
 397
 398 static enum scan_result
 399 scan_unexpected_char (const struct substring *s, struct token *token)
 400 {
 401   ucs4_t uc;
 402
 403   token->type = SCAN_UNEXPECTED_CHAR;
 404   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 405   token->number = uc;
 406
 407   return SCAN_DONE;
 408 }
 409
 410 const char *
 411 scan_type_to_string (enum scan_type type)
 412 {
 413   switch (type)
 414     {
 415 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
 416       SCAN_TYPES
 417 #undef SCAN_TYPE
 418
 419     default:
 420       return token_type_to_name ((enum token_type) type);
 421     }
 422 }
 423
 424 bool
 425 is_scan_type (enum scan_type type)
 426 {
 427   return type > SCAN_FIRST && type < SCAN_LAST;
 428 }
 429
 430 /* If TOKEN has the type of a scan error (a subset of those identified by
 431    is_scan_type()), returns an appropriate error message.  Otherwise, returns
 432    NULL. */
 433 char *
 434 scan_token_to_error (const struct token *token)
 435 {
 436   switch (token->type)
 437     {
 438     case SCAN_BAD_HEX_LENGTH:
 439       return xasprintf (_("String of hex digits has %d characters, which "
 440                           "is not a multiple of 2."), (int) token->number);
 441
 442     case SCAN_BAD_HEX_DIGIT:
 443     case SCAN_BAD_UNICODE_DIGIT:
 444       return xasprintf (_("`%c' is not a valid hex digit."),
 445                         (int) token->number);
 446
 447     case SCAN_BAD_UNICODE_LENGTH:
 448       return xasprintf (_("Unicode string contains %d bytes, which is "
 449                           "not in the valid range of 1 to 8 bytes."),
 450                         (int) token->number);
 451
 452     case SCAN_BAD_UNICODE_CODE_POINT:
 453       return xasprintf (_("U+%04X is not a valid Unicode code point."),
 454                         (int) token->number);
 455
 456     case SCAN_EXPECTED_QUOTE:
 457       return xasprintf (_("Unterminated string constant."));
 458
 459     case SCAN_EXPECTED_EXPONENT:
 460       return xasprintf (_("Missing exponent following `%s'."),
 461                         token->string.string);
 462
 463     case SCAN_UNEXPECTED_CHAR:
 464      {
 465       char c_name[16];
 466       return xasprintf (_("Bad character %s in input."),
 467                         uc_name (token->number, c_name));
 468      }
 469     }
 470
 471   return NULL;
 472 }
 473
 474 static enum scan_result
 475 scan_start__ (struct scanner *scanner, enum segment_type type,
 476               struct substring s, struct token *token)
 477 {
 478   switch (type)
 479     {
 480     case SEG_NUMBER:
 481       scan_number__ (s, token);
 482       return SCAN_DONE;
 483
 484     case SEG_QUOTED_STRING:
 485     case SEG_HEX_STRING:
 486     case SEG_UNICODE_STRING:
 487       return scan_string_segment__ (scanner, type, s, token);
 488
 489     case SEG_UNQUOTED_STRING:
 490     case SEG_DO_REPEAT_COMMAND:
 491     case SEG_INLINE_DATA:
 492     case SEG_DOCUMENT:
 493     case SEG_MACRO_BODY:
 494       token->type = T_STRING;
 495       ss_alloc_substring (&token->string, s);
 496       return SCAN_DONE;
 497
 498     case SEG_RESERVED_WORD:
 499       token->type = scan_reserved_word__ (s);
 500       return SCAN_DONE;
 501
 502     case SEG_IDENTIFIER:
 503       token->type = T_ID;
 504       ss_alloc_substring (&token->string, s);
 505       return SCAN_DONE;
 506
 507     case SEG_MACRO_ID:
 508       token->type = T_MACRO_ID;
 509       ss_alloc_substring (&token->string, s);
 510       return SCAN_DONE;
 511
 512     case SEG_PUNCT:
 513       if (s.length == 1 && s.string[0] == '-')
 514         {
 515           scanner->state = S_DASH;
 516           return SCAN_SAVE;
 517         }
 518       else
 519         {
 520           token->type = scan_punct__ (s);
 521           if (token->type == T_MACRO_PUNCT)
 522             ss_alloc_substring (&token->string, s);
 523           return SCAN_DONE;
 524         }
 525
 526     case SEG_SHBANG:
 527     case SEG_SPACES:
 528     case SEG_COMMENT:
 529     case SEG_NEWLINE:
 530     case SEG_COMMENT_COMMAND:
 531       token->type = SCAN_SKIP;
 532       return SCAN_DONE;
 533
 534     case SEG_START_DOCUMENT:
 535       token->type = T_ID;
 536       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 537       return SCAN_DONE;
 538
 539     case SEG_START_COMMAND:
 540     case SEG_SEPARATE_COMMANDS:
 541     case SEG_END_COMMAND:
 542       token->type = T_ENDCMD;
 543       return SCAN_DONE;
 544
 545     case SEG_END:
 546       token->type = T_STOP;
 547       return SCAN_DONE;
 548
 549     case SEG_EXPECTED_QUOTE:
 550       token->type = SCAN_EXPECTED_QUOTE;
 551       return SCAN_DONE;
 552
 553     case SEG_EXPECTED_EXPONENT:
 554       token->type = SCAN_EXPECTED_EXPONENT;
 555       ss_alloc_substring (&token->string, s);
 556       return SCAN_DONE;
 557
 558     case SEG_UNEXPECTED_CHAR:
 559       return scan_unexpected_char (&s, token);
 560     }
 561
 562   NOT_REACHED ();
 563 }
 564
 565 static enum scan_result
 566 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
 567 {
 568   switch (type)
 569     {
 570     case SEG_SPACES:
 571     case SEG_COMMENT:
 572       return SCAN_MORE;
 573
 574     case SEG_NUMBER:
 575       scan_number__ (s, token);
 576       token->type = T_NEG_NUM;
 577       token->number = -token->number;
 578       return SCAN_DONE;
 579
 580     default:
 581       token->type = T_DASH;
 582       return SCAN_BACK;
 583     }
 584 }
 585
 586 /* Initializes SCANNER for scanning a token from a sequence of segments.
 587    Initializes TOKEN as the output token.  (The client retains ownership of
 588    TOKEN, but it must be preserved across subsequent calls to scanner_push()
 589    for SCANNER.)
 590
 591    A scanner only produces a single token.  To obtain the next token,
 592    re-initialize it by calling this function again.
 593
 594    A scanner does not contain any external references, so nothing needs to be
 595    done to destroy one.  For the same reason, scanners may be copied with plain
 596    struct assignment (or memcpy). */
 597 void
 598 scanner_init (struct scanner *scanner, struct token *token)
 599 {
 600   scanner->state = S_START;
 601   *token = (struct token) { .type = T_STOP };
 602 }
 603
 604 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER.  TOKEN must be
 605    the same token passed to scanner_init() for SCANNER, or a copy of it.
 606    scanner_push() may modify TOKEN.  The client retains ownership of TOKEN,
 607
 608    The possible return values are:
 609
 610      - SCAN_DONE: All of the segments that have been passed to scanner_push()
 611        form the token now stored in TOKEN.  SCANNER is now "used up" and must
 612        be reinitialized with scanner_init() if it is to be used again.
 613
 614        Most tokens only consist of a single segment, so this is the most common
 615        return value.
 616
 617      - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
 618        token.  The caller should call scanner_push() again with the next token.
 619        (This won't happen if TYPE is SEG_END indicating the end of input.)
 620
 621      - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
 622        needs to "save its place" in the stream of segments for a possible
 623        future SCAN_BACK return.  This value can be returned more than once in a
 624        sequence of scanner_push() calls for SCANNER, but the caller only needs
 625        to keep track of the most recent position.
 626
 627      - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
 628        the segments up to and including the segment for which SCAN_SAVE was
 629        most recently returned.  Segments following that one should be passed to
 630        the next scanner to be initialized.
 631 */
 632 enum scan_result
 633 scanner_push (struct scanner *scanner, enum segment_type type,
 634               struct substring s, struct token *token)
 635 {
 636   switch (scanner->state)
 637     {
 638     case S_START:
 639       return scan_start__ (scanner, type, s, token);
 640
 641     case S_DASH:
 642       return scan_dash__ (type, s, token);
 643
 644     case S_STRING:
 645       return scan_string__ (scanner, type, s, token);
 646     }
 647
 648   NOT_REACHED ();
 649 }
 650 \f
 651 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 652    specified MODE.
 653
 654    SLEX has no internal state to free, but it retains a reference to INPUT, so
 655    INPUT must not be modified or freed while SLEX is still in use. */
 656 void
 657 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 658                    enum segmenter_mode mode, bool is_snippet)
 659 {
 660   *slex = (struct string_lexer) {
 661     .input = input,
 662     .length = length,
 663     .offset = 0,
 664     .segmenter = segmenter_init (mode, is_snippet),
 665   };
 666 }
 667
 668 /*  */
 669 bool
 670 string_lexer_next (struct string_lexer *slex, struct token *token)
 671 {
 672   struct segmenter saved_segmenter;
 673   size_t saved_offset = 0;
 674
 675   struct scanner scanner;
 676
 677   scanner_init (&scanner, token);
 678   for (;;)
 679     {
 680       const char *s = slex->input + slex->offset;
 681       size_t left = slex->length - slex->offset;
 682       enum segment_type type;
 683       int n;
 684
 685       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 686       assert (n >= 0);
 687
 688       slex->offset += n;
 689       switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
 690         {
 691         case SCAN_BACK:
 692           slex->segmenter = saved_segmenter;
 693           slex->offset = saved_offset;
 694           /* Fall through. */
 695         case SCAN_DONE:
 696           return token->type != T_STOP;
 697
 698         case SCAN_MORE:
 699           break;
 700
 701         case SCAN_SAVE:
 702           saved_segmenter = slex->segmenter;
 703           saved_offset = slex->offset;
 704           break;
 705         }
 706     }
 707 }