src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28
  29 #include "gl/c-ctype.h"
  30 #include "gl/xmemdup0.h"
  31
  32 enum
  33   {
  34     S_START,
  35     S_DASH,
  36     S_STRING
  37   };
  38
  39 #define SS_NL_BEFORE_PLUS (1u << 0)
  40 #define SS_PLUS           (1u << 1)
  41 #define SS_NL_AFTER_PLUS  (1u << 2)
  42
  43 /* Returns the integer value of (hex) digit C. */
  44 static int
  45 digit_value (int c)
  46 {
  47   switch (c)
  48     {
  49     case '0': return 0;
  50     case '1': return 1;
  51     case '2': return 2;
  52     case '3': return 3;
  53     case '4': return 4;
  54     case '5': return 5;
  55     case '6': return 6;
  56     case '7': return 7;
  57     case '8': return 8;
  58     case '9': return 9;
  59     case 'a': case 'A': return 10;
  60     case 'b': case 'B': return 11;
  61     case 'c': case 'C': return 12;
  62     case 'd': case 'D': return 13;
  63     case 'e': case 'E': return 14;
  64     case 'f': case 'F': return 15;
  65     default: return INT_MAX;
  66     }
  67 }
  68
  69 static bool
  70 scan_quoted_string__ (struct substring s, struct token *token)
  71 {
  72   int quote;
  73
  74   /* Trim ' or " from front and back. */
  75   quote = s.string[s.length - 1];
  76   s.string++;
  77   s.length -= 2;
  78
  79   ss_realloc (&token->string, token->string.length + s.length + 1);
  80
  81   for (;;)
  82     {
  83       size_t pos = ss_find_byte (s, quote);
  84       if (pos == SIZE_MAX)
  85         break;
  86
  87       memcpy (ss_end (token->string), s.string, pos + 1);
  88       token->string.length += pos + 1;
  89       ss_advance (&s, pos + 2);
  90     }
  91
  92   memcpy (ss_end (token->string), s.string, ss_length (s));
  93   token->string.length += ss_length (s);
  94
  95   return true;
  96 }
  97
  98 static bool
  99 scan_hex_string__ (struct substring s, struct token *token)
 100 {
 101   uint8_t *dst;
 102   size_t i;
 103
 104   /* Trim X' from front and ' from back. */
 105   s.string += 2;
 106   s.length -= 3;
 107
 108   if (s.length % 2 != 0)
 109     {
 110       token->type = SCAN_BAD_HEX_LENGTH;
 111       token->number = s.length;
 112       return false;
 113     }
 114
 115   ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
 116   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
 117   token->string.length += s.length / 2;
 118   for (i = 0; i < s.length; i += 2)
 119     {
 120       int hi = digit_value (s.string[i]);
 121       int lo = digit_value (s.string[i + 1]);
 122
 123       if (hi >= 16 || lo >= 16)
 124         {
 125           token->type = SCAN_BAD_HEX_DIGIT;
 126           token->number = s.string[hi >= 16 ? i : i + 1];
 127           return false;
 128         }
 129
 130       *dst++ = hi * 16 + lo;
 131     }
 132
 133   return true;
 134 }
 135
 136 static bool
 137 scan_unicode_string__ (struct substring s, struct token *token)
 138 {
 139   uint8_t *dst;
 140   ucs4_t uc;
 141   size_t i;
 142
 143   /* Trim U' from front and ' from back. */
 144   s.string += 2;
 145   s.length -= 3;
 146
 147   if (s.length < 1 || s.length > 8)
 148     {
 149       token->type = SCAN_BAD_UNICODE_LENGTH;
 150       token->number = s.length;
 151       return 0;
 152     }
 153
 154   ss_realloc (&token->string, token->string.length + 4 + 1);
 155
 156   uc = 0;
 157   for (i = 0; i < s.length; i++)
 158     {
 159       int digit = digit_value (s.string[i]);
 160       if (digit >= 16)
 161         {
 162           token->type = SCAN_BAD_UNICODE_DIGIT;
 163           token->number = s.string[i];
 164           return 0;
 165         }
 166       uc = uc * 16 + digit;
 167     }
 168
 169   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 170     {
 171       token->type = SCAN_BAD_UNICODE_CODE_POINT;
 172       token->number = uc;
 173       return 0;
 174     }
 175
 176   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
 177   token->string.length += u8_uctomb (dst, uc, 4);
 178
 179   return true;
 180 }
 181
 182 static enum scan_result
 183 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
 184                        struct substring s, struct token *token)
 185 {
 186   bool ok;
 187
 188   switch (type)
 189     {
 190     case SEG_QUOTED_STRING:
 191       ok = scan_quoted_string__ (s, token);
 192       break;
 193
 194     case SEG_HEX_STRING:
 195       ok = scan_hex_string__ (s, token);
 196       break;
 197
 198     case SEG_UNICODE_STRING:
 199       ok = scan_unicode_string__ (s, token);
 200       break;
 201
 202     default:
 203       NOT_REACHED ();
 204     }
 205
 206   if (ok)
 207     {
 208       token->type = T_STRING;
 209       token->string.string[token->string.length] = '\0';
 210       scanner->state = S_STRING;
 211       scanner->substate = 0;
 212       return SCAN_SAVE;
 213     }
 214   else
 215     {
 216       /* The function we called above should have filled in token->type and
 217          token->number properly to describe the error. */
 218       ss_dealloc (&token->string);
 219       token->string = ss_empty ();
 220       return SCAN_DONE;
 221     }
 222
 223 }
 224
 225 static enum scan_result
 226 add_bit (struct scanner *scanner, unsigned int bit)
 227 {
 228   if (!(scanner->substate & bit))
 229     {
 230       scanner->substate |= bit;
 231       return SCAN_MORE;
 232     }
 233   else
 234     return SCAN_BACK;
 235 }
 236
 237 static enum scan_result
 238 scan_string__ (struct scanner *scanner, enum segment_type type,
 239                struct substring s, struct token *token)
 240 {
 241   switch (type)
 242     {
 243     case SEG_SPACES:
 244     case SEG_COMMENT:
 245       return SCAN_MORE;
 246
 247     case SEG_NEWLINE:
 248       if (scanner->substate & SS_PLUS)
 249         return add_bit (scanner, SS_NL_AFTER_PLUS);
 250       else
 251         return add_bit (scanner, SS_NL_BEFORE_PLUS);
 252
 253     case SEG_PUNCT:
 254       return (s.length == 1 && s.string[0] == '+'
 255               ? add_bit (scanner, SS_PLUS)
 256               : SCAN_BACK);
 257
 258     case SEG_QUOTED_STRING:
 259     case SEG_HEX_STRING:
 260     case SEG_UNICODE_STRING:
 261       return (scanner->substate & SS_PLUS
 262               ? scan_string_segment__ (scanner, type, s, token)
 263               : SCAN_BACK);
 264
 265     default:
 266       return SCAN_BACK;
 267     }
 268 }
 269
 270 static enum token_type
 271 scan_reserved_word__ (struct substring word)
 272 {
 273   switch (c_toupper (word.string[0]))
 274     {
 275     case 'B':
 276       return T_BY;
 277
 278     case 'E':
 279       return T_EQ;
 280
 281     case 'G':
 282       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 283
 284     case 'L':
 285       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 286
 287     case 'N':
 288       return word.length == 2 ? T_NE : T_NOT;
 289
 290     case 'O':
 291       return T_OR;
 292
 293     case 'T':
 294       return T_TO;
 295
 296     case 'A':
 297       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 298
 299     case 'W':
 300       return T_WITH;
 301     }
 302
 303   NOT_REACHED ();
 304 }
 305
 306 static enum token_type
 307 scan_punct1__ (char c0)
 308 {
 309   switch (c0)
 310     {
 311     case '(': return T_LPAREN;
 312     case ')': return T_RPAREN;
 313     case ',': return T_COMMA;
 314     case '=': return T_EQUALS;
 315     case '-': return T_DASH;
 316     case '[': return T_LBRACK;
 317     case ']': return T_RBRACK;
 318     case '&': return T_AND;
 319     case '|': return T_OR;
 320     case '+': return T_PLUS;
 321     case '/': return T_SLASH;
 322     case '*': return T_ASTERISK;
 323     case '<': return T_LT;
 324     case '>': return T_GT;
 325     case '~': return T_NOT;
 326     }
 327
 328   NOT_REACHED ();
 329 }
 330
 331 static enum token_type
 332 scan_punct2__ (char c0, char c1)
 333 {
 334   switch (c0)
 335     {
 336     case '*':
 337       return T_EXP;
 338
 339     case '<':
 340       return c1 == '=' ? T_LE : T_NE;
 341
 342     case '>':
 343       return T_GE;
 344
 345     case '~':
 346       return T_NE;
 347
 348     case '&':
 349       return T_AND;
 350
 351     case '|':
 352       return T_OR;
 353     }
 354
 355   NOT_REACHED ();
 356 }
 357
 358 static enum token_type
 359 scan_punct__ (struct substring s)
 360 {
 361   return (s.length == 1
 362           ? scan_punct1__ (s.string[0])
 363           : scan_punct2__ (s.string[0], s.string[1]));
 364 }
 365
 366 static double
 367 scan_number__ (struct substring s)
 368 {
 369   char buf[128];
 370   double number;
 371   char *p;
 372
 373   if (s.length < sizeof buf)
 374     {
 375       p = buf;
 376       memcpy (buf, s.string, s.length);
 377       buf[s.length] = '\0';
 378     }
 379   else
 380     p = xmemdup0 (s.string, s.length);
 381
 382   number = strtod (p, NULL);
 383
 384   if (p != buf)
 385     free (p);
 386
 387   return number;
 388 }
 389
 390 static enum scan_result
 391 scan_unexpected_char (const struct substring *s, struct token *token)
 392 {
 393   ucs4_t uc;
 394
 395   token->type = SCAN_UNEXPECTED_CHAR;
 396   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 397   token->number = uc;
 398
 399   return SCAN_DONE;
 400 }
 401
 402 const char *
 403 scan_type_to_string (enum scan_type type)
 404 {
 405   switch (type)
 406     {
 407 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
 408       SCAN_TYPES
 409 #undef SCAN_TYPE
 410
 411     default:
 412       return token_type_to_name (type);
 413     }
 414 }
 415
 416 bool
 417 is_scan_type (enum scan_type type)
 418 {
 419   return type > SCAN_FIRST && type < SCAN_LAST;
 420 }
 421
 422 static enum scan_result
 423 scan_start__ (struct scanner *scanner, enum segment_type type,
 424               struct substring s, struct token *token)
 425 {
 426   switch (type)
 427     {
 428     case SEG_NUMBER:
 429       token->type = T_POS_NUM;
 430       token->number = scan_number__ (s);
 431       return SCAN_DONE;
 432
 433     case SEG_QUOTED_STRING:
 434     case SEG_HEX_STRING:
 435     case SEG_UNICODE_STRING:
 436       return scan_string_segment__ (scanner, type, s, token);
 437
 438     case SEG_UNQUOTED_STRING:
 439     case SEG_DO_REPEAT_COMMAND:
 440     case SEG_INLINE_DATA:
 441     case SEG_DOCUMENT:
 442       token->type = T_STRING;
 443       ss_alloc_substring (&token->string, s);
 444       return SCAN_DONE;
 445
 446     case SEG_RESERVED_WORD:
 447       token->type = scan_reserved_word__ (s);
 448       return SCAN_DONE;
 449
 450     case SEG_IDENTIFIER:
 451       token->type = T_ID;
 452       ss_alloc_substring (&token->string, s);
 453       return SCAN_DONE;
 454
 455     case SEG_PUNCT:
 456       if (s.length == 1 && s.string[0] == '-')
 457         {
 458           scanner->state = S_DASH;
 459           return SCAN_SAVE;
 460         }
 461       else
 462         {
 463           token->type = scan_punct__ (s);
 464           return SCAN_DONE;
 465         }
 466
 467     case SEG_SHBANG:
 468     case SEG_SPACES:
 469     case SEG_COMMENT:
 470     case SEG_NEWLINE:
 471     case SEG_COMMENT_COMMAND:
 472       token->type = SCAN_SKIP;
 473       return SCAN_DONE;
 474
 475     case SEG_START_DOCUMENT:
 476       token->type = T_ID;
 477       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 478       return SCAN_DONE;
 479
 480     case SEG_START_COMMAND:
 481     case SEG_SEPARATE_COMMANDS:
 482     case SEG_END_COMMAND:
 483       token->type = T_ENDCMD;
 484       return SCAN_DONE;
 485
 486     case SEG_END:
 487       token->type = T_STOP;
 488       return SCAN_DONE;
 489
 490     case SEG_EXPECTED_QUOTE:
 491       token->type = SCAN_EXPECTED_QUOTE;
 492       return SCAN_DONE;
 493
 494     case SEG_EXPECTED_EXPONENT:
 495       token->type = SCAN_EXPECTED_EXPONENT;
 496       ss_alloc_substring (&token->string, s);
 497       return SCAN_DONE;
 498
 499     case SEG_UNEXPECTED_DOT:
 500       token->type = SCAN_UNEXPECTED_DOT;
 501       return SCAN_DONE;
 502
 503     case SEG_UNEXPECTED_CHAR:
 504       return scan_unexpected_char (&s, token);
 505
 506     case SEG_N_TYPES:
 507       NOT_REACHED ();
 508     }
 509
 510   NOT_REACHED ();
 511 }
 512
 513 static enum scan_result
 514 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
 515 {
 516   switch (type)
 517     {
 518     case SEG_SPACES:
 519     case SEG_COMMENT:
 520       return SCAN_MORE;
 521
 522     case SEG_NUMBER:
 523       token->type = T_NEG_NUM;
 524       token->number = -scan_number__ (s);
 525       return SCAN_DONE;
 526
 527     default:
 528       token->type = T_DASH;
 529       return SCAN_BACK;
 530     }
 531 }
 532
 533 /* Initializes SCANNER for scanning a token from a sequence of segments.
 534    Initializes TOKEN as the output token.  (The client retains ownership of
 535    TOKEN, but it must be preserved across subsequent calls to scanner_push()
 536    for SCANNER.)
 537
 538    A scanner only produces a single token.  To obtain the next token,
 539    re-initialize it by calling this function again.
 540
 541    A scanner does not contain any external references, so nothing needs to be
 542    done to destroy one.  For the same reason, scanners may be copied with plain
 543    struct assignment (or memcpy). */
 544 void
 545 scanner_init (struct scanner *scanner, struct token *token)
 546 {
 547   scanner->state = S_START;
 548   token_init (token);
 549 }
 550
 551 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER.  TOKEN must be
 552    the same token passed to scanner_init() for SCANNER, or a copy of it.
 553    scanner_push() may modify TOKEN.  The client retains ownership of TOKEN,
 554
 555    The possible return values are:
 556
 557      - SCAN_DONE: All of the segments that have been passed to scanner_push()
 558        form the token now stored in TOKEN.  SCANNER is now "used up" and must
 559        be reinitialized with scanner_init() if it is to be used again.
 560
 561        Most tokens only consist of a single segment, so this is the most common
 562        return value.
 563
 564      - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
 565        token.  The caller should call scanner_push() again with the next token.
 566        (This won't happen if TYPE is SEG_END indicating the end of input.)
 567
 568      - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
 569        needs to "save its place" in the stream of segments for a possible
 570        future SCAN_BACK return.  This value can be returned more than once in a
 571        sequence of scanner_push() calls for SCANNER, but the caller only needs
 572        to keep track of the most recent position.
 573
 574      - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
 575        the segments up to and including the segment for which SCAN_SAVE was
 576        most recently returned.  Segments following that one should be passed to
 577        the next scanner to be initialized.
 578 */
 579 enum scan_result
 580 scanner_push (struct scanner *scanner, enum segment_type type,
 581               struct substring s, struct token *token)
 582 {
 583   switch (scanner->state)
 584     {
 585     case S_START:
 586       return scan_start__ (scanner, type, s, token);
 587
 588     case S_DASH:
 589       return scan_dash__ (type, s, token);
 590
 591     case S_STRING:
 592       return scan_string__ (scanner, type, s, token);
 593     }
 594
 595   NOT_REACHED ();
 596 }