pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28
  29 #include "gl/c-ctype.h"
  30 #include "gl/c-strtod.h"
  31 #include "gl/xmemdup0.h"
  32
  33 enum
  34   {
  35     S_START,
  36     S_DASH,
  37     S_STRING
  38   };
  39
  40 #define SS_NL_BEFORE_PLUS (1u << 0)
  41 #define SS_PLUS           (1u << 1)
  42 #define SS_NL_AFTER_PLUS  (1u << 2)
  43
  44 /* Returns the integer value of (hex) digit C. */
  45 static int
  46 digit_value (int c)
  47 {
  48   switch (c)
  49     {
  50     case '0': return 0;
  51     case '1': return 1;
  52     case '2': return 2;
  53     case '3': return 3;
  54     case '4': return 4;
  55     case '5': return 5;
  56     case '6': return 6;
  57     case '7': return 7;
  58     case '8': return 8;
  59     case '9': return 9;
  60     case 'a': case 'A': return 10;
  61     case 'b': case 'B': return 11;
  62     case 'c': case 'C': return 12;
  63     case 'd': case 'D': return 13;
  64     case 'e': case 'E': return 14;
  65     case 'f': case 'F': return 15;
  66     default: return INT_MAX;
  67     }
  68 }
  69
  70 static bool
  71 scan_quoted_string__ (struct substring s, struct token *token)
  72 {
  73   int quote;
  74
  75   /* Trim ' or " from front and back. */
  76   quote = s.string[s.length - 1];
  77   s.string++;
  78   s.length -= 2;
  79
  80   ss_realloc (&token->string, token->string.length + s.length + 1);
  81
  82   for (;;)
  83     {
  84       size_t pos = ss_find_byte (s, quote);
  85       if (pos == SIZE_MAX)
  86         break;
  87
  88       memcpy (ss_end (token->string), s.string, pos + 1);
  89       token->string.length += pos + 1;
  90       ss_advance (&s, pos + 2);
  91     }
  92
  93   memcpy (ss_end (token->string), s.string, ss_length (s));
  94   token->string.length += ss_length (s);
  95
  96   return true;
  97 }
  98
  99 static bool
 100 scan_hex_string__ (struct substring s, struct token *token)
 101 {
 102   uint8_t *dst;
 103   size_t i;
 104
 105   /* Trim X' from front and ' from back. */
 106   s.string += 2;
 107   s.length -= 3;
 108
 109   if (s.length % 2 != 0)
 110     {
 111       token->type = SCAN_BAD_HEX_LENGTH;
 112       token->number = s.length;
 113       return false;
 114     }
 115
 116   ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
 117   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
 118   token->string.length += s.length / 2;
 119   for (i = 0; i < s.length; i += 2)
 120     {
 121       int hi = digit_value (s.string[i]);
 122       int lo = digit_value (s.string[i + 1]);
 123
 124       if (hi >= 16 || lo >= 16)
 125         {
 126           token->type = SCAN_BAD_HEX_DIGIT;
 127           token->number = s.string[hi >= 16 ? i : i + 1];
 128           return false;
 129         }
 130
 131       *dst++ = hi * 16 + lo;
 132     }
 133
 134   return true;
 135 }
 136
 137 static bool
 138 scan_unicode_string__ (struct substring s, struct token *token)
 139 {
 140   uint8_t *dst;
 141   ucs4_t uc;
 142   size_t i;
 143
 144   /* Trim U' from front and ' from back. */
 145   s.string += 2;
 146   s.length -= 3;
 147
 148   if (s.length < 1 || s.length > 8)
 149     {
 150       token->type = SCAN_BAD_UNICODE_LENGTH;
 151       token->number = s.length;
 152       return 0;
 153     }
 154
 155   ss_realloc (&token->string, token->string.length + 4 + 1);
 156
 157   uc = 0;
 158   for (i = 0; i < s.length; i++)
 159     {
 160       int digit = digit_value (s.string[i]);
 161       if (digit >= 16)
 162         {
 163           token->type = SCAN_BAD_UNICODE_DIGIT;
 164           token->number = s.string[i];
 165           return 0;
 166         }
 167       uc = uc * 16 + digit;
 168     }
 169
 170   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 171     {
 172       token->type = SCAN_BAD_UNICODE_CODE_POINT;
 173       token->number = uc;
 174       return 0;
 175     }
 176
 177   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
 178   token->string.length += u8_uctomb (dst, uc, 4);
 179
 180   return true;
 181 }
 182
 183 static enum scan_result
 184 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
 185                        struct substring s, struct token *token)
 186 {
 187   bool ok;
 188
 189   switch (type)
 190     {
 191     case SEG_QUOTED_STRING:
 192       ok = scan_quoted_string__ (s, token);
 193       break;
 194
 195     case SEG_HEX_STRING:
 196       ok = scan_hex_string__ (s, token);
 197       break;
 198
 199     case SEG_UNICODE_STRING:
 200       ok = scan_unicode_string__ (s, token);
 201       break;
 202
 203     default:
 204       NOT_REACHED ();
 205     }
 206
 207   if (ok)
 208     {
 209       token->type = T_STRING;
 210       token->string.string[token->string.length] = '\0';
 211       scanner->state = S_STRING;
 212       scanner->substate = 0;
 213       return SCAN_SAVE;
 214     }
 215   else
 216     {
 217       /* The function we called above should have filled in token->type and
 218          token->number properly to describe the error. */
 219       ss_dealloc (&token->string);
 220       token->string = ss_empty ();
 221       return SCAN_DONE;
 222     }
 223
 224 }
 225
 226 static enum scan_result
 227 add_bit (struct scanner *scanner, unsigned int bit)
 228 {
 229   if (!(scanner->substate & bit))
 230     {
 231       scanner->substate |= bit;
 232       return SCAN_MORE;
 233     }
 234   else
 235     return SCAN_BACK;
 236 }
 237
 238 static enum scan_result
 239 scan_string__ (struct scanner *scanner, enum segment_type type,
 240                struct substring s, struct token *token)
 241 {
 242   switch (type)
 243     {
 244     case SEG_SPACES:
 245     case SEG_COMMENT:
 246       return SCAN_MORE;
 247
 248     case SEG_NEWLINE:
 249       if (scanner->substate & SS_PLUS)
 250         return add_bit (scanner, SS_NL_AFTER_PLUS);
 251       else
 252         return add_bit (scanner, SS_NL_BEFORE_PLUS);
 253
 254     case SEG_PUNCT:
 255       return (s.length == 1 && s.string[0] == '+'
 256               ? add_bit (scanner, SS_PLUS)
 257               : SCAN_BACK);
 258
 259     case SEG_QUOTED_STRING:
 260     case SEG_HEX_STRING:
 261     case SEG_UNICODE_STRING:
 262       return (scanner->substate & SS_PLUS
 263               ? scan_string_segment__ (scanner, type, s, token)
 264               : SCAN_BACK);
 265
 266     default:
 267       return SCAN_BACK;
 268     }
 269 }
 270
 271 static enum token_type
 272 scan_reserved_word__ (struct substring word)
 273 {
 274   switch (c_toupper (word.string[0]))
 275     {
 276     case 'B':
 277       return T_BY;
 278
 279     case 'E':
 280       return T_EQ;
 281
 282     case 'G':
 283       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 284
 285     case 'L':
 286       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 287
 288     case 'N':
 289       return word.length == 2 ? T_NE : T_NOT;
 290
 291     case 'O':
 292       return T_OR;
 293
 294     case 'T':
 295       return T_TO;
 296
 297     case 'A':
 298       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 299
 300     case 'W':
 301       return T_WITH;
 302     }
 303
 304   NOT_REACHED ();
 305 }
 306
 307 static enum token_type
 308 scan_punct1__ (char c0)
 309 {
 310   switch (c0)
 311     {
 312     case '(': return T_LPAREN;
 313     case ')': return T_RPAREN;
 314     case ',': return T_COMMA;
 315     case '=': return T_EQUALS;
 316     case '-': return T_DASH;
 317     case '[': return T_LBRACK;
 318     case ']': return T_RBRACK;
 319     case '&': return T_AND;
 320     case '|': return T_OR;
 321     case '+': return T_PLUS;
 322     case '/': return T_SLASH;
 323     case '*': return T_ASTERISK;
 324     case '<': return T_LT;
 325     case '>': return T_GT;
 326     case '~': return T_NOT;
 327     }
 328
 329   NOT_REACHED ();
 330 }
 331
 332 static enum token_type
 333 scan_punct2__ (char c0, char c1)
 334 {
 335   switch (c0)
 336     {
 337     case '*':
 338       return T_EXP;
 339
 340     case '<':
 341       return c1 == '=' ? T_LE : T_NE;
 342
 343     case '>':
 344       return T_GE;
 345
 346     case '~':
 347       return T_NE;
 348
 349     case '&':
 350       return T_AND;
 351
 352     case '|':
 353       return T_OR;
 354     }
 355
 356   NOT_REACHED ();
 357 }
 358
 359 static enum token_type
 360 scan_punct__ (struct substring s)
 361 {
 362   return (s.length == 1
 363           ? scan_punct1__ (s.string[0])
 364           : scan_punct2__ (s.string[0], s.string[1]));
 365 }
 366
 367 static double
 368 scan_number__ (struct substring s)
 369 {
 370   char buf[128];
 371   double number;
 372   char *p;
 373
 374   if (s.length < sizeof buf)
 375     {
 376       p = buf;
 377       memcpy (buf, s.string, s.length);
 378       buf[s.length] = '\0';
 379     }
 380   else
 381     p = xmemdup0 (s.string, s.length);
 382
 383   number = c_strtod (p, NULL);
 384
 385   if (p != buf)
 386     free (p);
 387
 388   return number;
 389 }
 390
 391 static enum scan_result
 392 scan_unexpected_char (const struct substring *s, struct token *token)
 393 {
 394   ucs4_t uc;
 395
 396   token->type = SCAN_UNEXPECTED_CHAR;
 397   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 398   token->number = uc;
 399
 400   return SCAN_DONE;
 401 }
 402
 403 const char *
 404 scan_type_to_string (enum scan_type type)
 405 {
 406   switch (type)
 407     {
 408 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
 409       SCAN_TYPES
 410 #undef SCAN_TYPE
 411
 412     default:
 413       return token_type_to_name ((enum token_type) type);
 414     }
 415 }
 416
 417 bool
 418 is_scan_type (enum scan_type type)
 419 {
 420   return type > SCAN_FIRST && type < SCAN_LAST;
 421 }
 422
 423 static enum scan_result
 424 scan_start__ (struct scanner *scanner, enum segment_type type,
 425               struct substring s, struct token *token)
 426 {
 427   switch (type)
 428     {
 429     case SEG_NUMBER:
 430       token->type = T_POS_NUM;
 431       token->number = scan_number__ (s);
 432       return SCAN_DONE;
 433
 434     case SEG_QUOTED_STRING:
 435     case SEG_HEX_STRING:
 436     case SEG_UNICODE_STRING:
 437       return scan_string_segment__ (scanner, type, s, token);
 438
 439     case SEG_UNQUOTED_STRING:
 440     case SEG_DO_REPEAT_COMMAND:
 441     case SEG_INLINE_DATA:
 442     case SEG_DOCUMENT:
 443       token->type = T_STRING;
 444       ss_alloc_substring (&token->string, s);
 445       return SCAN_DONE;
 446
 447     case SEG_RESERVED_WORD:
 448       token->type = scan_reserved_word__ (s);
 449       return SCAN_DONE;
 450
 451     case SEG_IDENTIFIER:
 452       token->type = T_ID;
 453       ss_alloc_substring (&token->string, s);
 454       return SCAN_DONE;
 455
 456     case SEG_MACRO_ID:
 457       token->type = T_MACRO_ID;
 458       ss_alloc_substring (&token->string, s);
 459       return SCAN_DONE;
 460
 461     case SEG_PUNCT:
 462       if (s.length == 1 && s.string[0] == '-')
 463         {
 464           scanner->state = S_DASH;
 465           return SCAN_SAVE;
 466         }
 467       else
 468         {
 469           token->type = scan_punct__ (s);
 470           return SCAN_DONE;
 471         }
 472
 473     case SEG_SHBANG:
 474     case SEG_SPACES:
 475     case SEG_COMMENT:
 476     case SEG_NEWLINE:
 477     case SEG_COMMENT_COMMAND:
 478       token->type = SCAN_SKIP;
 479       return SCAN_DONE;
 480
 481     case SEG_START_DOCUMENT:
 482       token->type = T_ID;
 483       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 484       return SCAN_DONE;
 485
 486     case SEG_START_COMMAND:
 487     case SEG_SEPARATE_COMMANDS:
 488     case SEG_END_COMMAND:
 489       token->type = T_ENDCMD;
 490       return SCAN_DONE;
 491
 492     case SEG_END:
 493       token->type = T_STOP;
 494       return SCAN_DONE;
 495
 496     case SEG_EXPECTED_QUOTE:
 497       token->type = SCAN_EXPECTED_QUOTE;
 498       return SCAN_DONE;
 499
 500     case SEG_EXPECTED_EXPONENT:
 501       token->type = SCAN_EXPECTED_EXPONENT;
 502       ss_alloc_substring (&token->string, s);
 503       return SCAN_DONE;
 504
 505     case SEG_UNEXPECTED_DOT:
 506       token->type = SCAN_UNEXPECTED_DOT;
 507       return SCAN_DONE;
 508
 509     case SEG_UNEXPECTED_CHAR:
 510       return scan_unexpected_char (&s, token);
 511     }
 512
 513   NOT_REACHED ();
 514 }
 515
 516 static enum scan_result
 517 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
 518 {
 519   switch (type)
 520     {
 521     case SEG_SPACES:
 522     case SEG_COMMENT:
 523       return SCAN_MORE;
 524
 525     case SEG_NUMBER:
 526       token->type = T_NEG_NUM;
 527       token->number = -scan_number__ (s);
 528       return SCAN_DONE;
 529
 530     default:
 531       token->type = T_DASH;
 532       return SCAN_BACK;
 533     }
 534 }
 535
 536 /* Initializes SCANNER for scanning a token from a sequence of segments.
 537    Initializes TOKEN as the output token.  (The client retains ownership of
 538    TOKEN, but it must be preserved across subsequent calls to scanner_push()
 539    for SCANNER.)
 540
 541    A scanner only produces a single token.  To obtain the next token,
 542    re-initialize it by calling this function again.
 543
 544    A scanner does not contain any external references, so nothing needs to be
 545    done to destroy one.  For the same reason, scanners may be copied with plain
 546    struct assignment (or memcpy). */
 547 void
 548 scanner_init (struct scanner *scanner, struct token *token)
 549 {
 550   scanner->state = S_START;
 551   token_init (token);
 552 }
 553
 554 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER.  TOKEN must be
 555    the same token passed to scanner_init() for SCANNER, or a copy of it.
 556    scanner_push() may modify TOKEN.  The client retains ownership of TOKEN,
 557
 558    The possible return values are:
 559
 560      - SCAN_DONE: All of the segments that have been passed to scanner_push()
 561        form the token now stored in TOKEN.  SCANNER is now "used up" and must
 562        be reinitialized with scanner_init() if it is to be used again.
 563
 564        Most tokens only consist of a single segment, so this is the most common
 565        return value.
 566
 567      - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
 568        token.  The caller should call scanner_push() again with the next token.
 569        (This won't happen if TYPE is SEG_END indicating the end of input.)
 570
 571      - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
 572        needs to "save its place" in the stream of segments for a possible
 573        future SCAN_BACK return.  This value can be returned more than once in a
 574        sequence of scanner_push() calls for SCANNER, but the caller only needs
 575        to keep track of the most recent position.
 576
 577      - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
 578        the segments up to and including the segment for which SCAN_SAVE was
 579        most recently returned.  Segments following that one should be passed to
 580        the next scanner to be initialized.
 581 */
 582 enum scan_result
 583 scanner_push (struct scanner *scanner, enum segment_type type,
 584               struct substring s, struct token *token)
 585 {
 586   switch (scanner->state)
 587     {
 588     case S_START:
 589       return scan_start__ (scanner, type, s, token);
 590
 591     case S_DASH:
 592       return scan_dash__ (type, s, token);
 593
 594     case S_STRING:
 595       return scan_string__ (scanner, type, s, token);
 596     }
 597
 598   NOT_REACHED ();
 599 }
 600 \f
 601 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 602    specified MODE.
 603
 604    SLEX has no internal state to free, but it retains a reference to INPUT, so
 605    INPUT must not be modified or freed while SLEX is still in use. */
 606 void
 607 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 608                    enum segmenter_mode mode)
 609 {
 610   slex->input = input;
 611   slex->length = length;
 612   slex->offset = 0;
 613   segmenter_init (&slex->segmenter, mode);
 614 }
 615
 616 /*  */
 617 bool
 618 string_lexer_next (struct string_lexer *slex, struct token *token)
 619 {
 620   struct segmenter saved_segmenter;
 621   size_t saved_offset = 0;
 622
 623   struct scanner scanner;
 624
 625   scanner_init (&scanner, token);
 626   for (;;)
 627     {
 628       const char *s = slex->input + slex->offset;
 629       size_t left = slex->length - slex->offset;
 630       enum segment_type type;
 631       int n;
 632
 633       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 634       assert (n >= 0);
 635
 636       slex->offset += n;
 637       switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
 638         {
 639         case SCAN_BACK:
 640           slex->segmenter = saved_segmenter;
 641           slex->offset = saved_offset;
 642           /* Fall through. */
 643         case SCAN_DONE:
 644           return token->type != T_STOP;
 645
 646         case SCAN_MORE:
 647           break;
 648
 649         case SCAN_SAVE:
 650           saved_segmenter = slex->segmenter;
 651           saved_offset = slex->offset;
 652           break;
 653         }
 654     }
 655 }