pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28 #include "libpspp/i18n.h"
  29
  30 #include "gl/c-ctype.h"
  31 #include "gl/c-strtod.h"
  32 #include "gl/xmemdup0.h"
  33
  34 #include "gettext.h"
  35 #define _(msgid) gettext (msgid)
  36
  37 /* Returns the integer value of (hex) digit C. */
  38 static int
  39 digit_value (int c)
  40 {
  41   switch (c)
  42     {
  43     case '0': return 0;
  44     case '1': return 1;
  45     case '2': return 2;
  46     case '3': return 3;
  47     case '4': return 4;
  48     case '5': return 5;
  49     case '6': return 6;
  50     case '7': return 7;
  51     case '8': return 8;
  52     case '9': return 9;
  53     case 'a': case 'A': return 10;
  54     case 'b': case 'B': return 11;
  55     case 'c': case 'C': return 12;
  56     case 'd': case 'D': return 13;
  57     case 'e': case 'E': return 14;
  58     case 'f': case 'F': return 15;
  59     default: return INT_MAX;
  60     }
  61 }
  62
  63 static void
  64 scan_quoted_string (struct substring in, struct token *token)
  65 {
  66   /* Trim ' or " from front and back. */
  67   int quote = in.string[0];
  68   in.string++;
  69   in.length -= 2;
  70
  71   struct substring out = { .string = xmalloc (in.length + 1) };
  72
  73   for (;;)
  74     {
  75       size_t pos = ss_find_byte (in, quote);
  76       if (pos == SIZE_MAX)
  77         break;
  78
  79       memcpy (ss_end (out), in.string, pos + 1);
  80       out.length += pos + 1;
  81       ss_advance (&in, pos + 2);
  82     }
  83
  84   memcpy (ss_end (out), in.string, in.length);
  85   out.length += in.length;
  86   out.string[out.length] = '\0';
  87
  88   *token = (struct token) { .type = T_STRING, .string = out };
  89 }
  90
  91 static char *
  92 scan_hex_string__ (struct substring in, struct substring *out)
  93 {
  94   if (in.length % 2 != 0)
  95     return xasprintf (_("String of hex digits has %zu characters, which "
  96                         "is not a multiple of 2."), in.length);
  97
  98   ss_realloc (out, in.length / 2 + 1);
  99   uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
 100   out->length = in.length / 2;
 101   for (size_t i = 0; i < in.length; i += 2)
 102     {
 103       int hi = digit_value (in.string[i]);
 104       int lo = digit_value (in.string[i + 1]);
 105
 106       if (hi >= 16 || lo >= 16)
 107         return xasprintf (_("`%c' is not a valid hex digit."),
 108                           in.string[hi >= 16 ? i : i + 1]);
 109
 110       *dst++ = hi * 16 + lo;
 111     }
 112
 113   return NULL;
 114 }
 115
 116 static char *
 117 scan_unicode_string__ (struct substring in, struct substring *out)
 118 {
 119   if (in.length < 1 || in.length > 8)
 120     return xasprintf (_("Unicode string contains %zu bytes, which is "
 121                         "not in the valid range of 1 to 8 bytes."),
 122                       in.length);
 123
 124   ucs4_t uc = 0;
 125   for (size_t i = 0; i < in.length; i++)
 126     {
 127       int digit = digit_value (in.string[i]);
 128       if (digit >= 16)
 129         return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
 130       uc = uc * 16 + digit;
 131     }
 132
 133   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 134     return xasprintf (_("U+%04llX is not a valid Unicode code point."),
 135                       (long long) uc);
 136
 137   ss_realloc (out, 4 + 1);
 138   out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
 139
 140   return NULL;
 141 }
 142
 143 static enum token_type
 144 scan_reserved_word__ (struct substring word)
 145 {
 146   switch (c_toupper (word.string[0]))
 147     {
 148     case 'B':
 149       return T_BY;
 150
 151     case 'E':
 152       return T_EQ;
 153
 154     case 'G':
 155       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 156
 157     case 'L':
 158       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 159
 160     case 'N':
 161       return word.length == 2 ? T_NE : T_NOT;
 162
 163     case 'O':
 164       return T_OR;
 165
 166     case 'T':
 167       return T_TO;
 168
 169     case 'A':
 170       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 171
 172     case 'W':
 173       return T_WITH;
 174     }
 175
 176   NOT_REACHED ();
 177 }
 178
 179 static enum token_type
 180 scan_punct1__ (char c0)
 181 {
 182   switch (c0)
 183     {
 184     case '(': return T_LPAREN;
 185     case ')': return T_RPAREN;
 186     case ',': return T_COMMA;
 187     case '=': return T_EQUALS;
 188     case '-': return T_DASH;
 189     case '[': return T_LBRACK;
 190     case ']': return T_RBRACK;
 191     case '&': return T_AND;
 192     case '|': return T_OR;
 193     case '+': return T_PLUS;
 194     case '/': return T_SLASH;
 195     case '*': return T_ASTERISK;
 196     case '<': return T_LT;
 197     case '>': return T_GT;
 198     case '~': return T_NOT;
 199     default: return T_MACRO_PUNCT;
 200     }
 201
 202   NOT_REACHED ();
 203 }
 204
 205 static enum token_type
 206 scan_punct2__ (char c0, char c1)
 207 {
 208   switch (c0)
 209     {
 210     case '*':
 211       return T_EXP;
 212
 213     case '<':
 214       return c1 == '=' ? T_LE : T_NE;
 215
 216     case '>':
 217       return T_GE;
 218
 219     case '~':
 220       return T_NE;
 221
 222     case '&':
 223       return T_AND;
 224
 225     case '|':
 226       return T_OR;
 227     }
 228
 229   NOT_REACHED ();
 230 }
 231
 232 static enum token_type
 233 scan_punct__ (struct substring s)
 234 {
 235   return (s.length == 1
 236           ? scan_punct1__ (s.string[0])
 237           : scan_punct2__ (s.string[0], s.string[1]));
 238 }
 239
 240 static void
 241 scan_number__ (struct substring s, struct token *token)
 242 {
 243   char buf[128];
 244   char *p;
 245
 246   if (s.length < sizeof buf)
 247     {
 248       p = buf;
 249       memcpy (buf, s.string, s.length);
 250       buf[s.length] = '\0';
 251     }
 252   else
 253     p = xmemdup0 (s.string, s.length);
 254
 255   bool negative = *p == '-';
 256   double x = c_strtod (p + negative, NULL);
 257   *token = (struct token) {
 258     .type = negative ? T_NEG_NUM : T_POS_NUM,
 259     .number = negative ? -x : x,
 260   };
 261
 262   if (p != buf)
 263     free (p);
 264 }
 265 \f
 266 static void
 267 tokenize_error__ (struct token *token, char *error)
 268 {
 269   *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
 270 }
 271
 272 static enum tokenize_result
 273 tokenize_string_segment__ (enum segment_type type,
 274                            struct substring s, struct token *token)
 275 {
 276   /* Trim X' or U' from front and ' from back. */
 277   s.string += 2;
 278   s.length -= 3;
 279
 280   struct substring out = SS_EMPTY_INITIALIZER;
 281   char *error = (type == SEG_HEX_STRING
 282                  ? scan_hex_string__ (s, &out)
 283                  : scan_unicode_string__ (s, &out));
 284   if (!error)
 285     {
 286       out.string[out.length] = '\0';
 287       *token = (struct token) { .type = T_STRING, .string = out };
 288       return TOKENIZE_TOKEN;
 289     }
 290   else
 291     {
 292       tokenize_error__ (token, error);
 293       return TOKENIZE_ERROR;
 294     }
 295 }
 296
 297 static void
 298 tokenize_unexpected_char (const struct substring *s, struct token *token)
 299 {
 300   ucs4_t uc;
 301   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 302
 303   char c_name[16];
 304   tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
 305                                       uc_name (uc, c_name)));
 306 }
 307
 308 enum tokenize_result
 309 token_from_segment (enum segment_type type, struct substring s,
 310                     struct token *token)
 311 {
 312   switch (type)
 313     {
 314     case SEG_NUMBER:
 315       scan_number__ (s, token);
 316       return TOKENIZE_TOKEN;
 317
 318     case SEG_QUOTED_STRING:
 319       scan_quoted_string (s, token);
 320       return TOKENIZE_TOKEN;
 321
 322     case SEG_HEX_STRING:
 323     case SEG_UNICODE_STRING:
 324       return tokenize_string_segment__ (type, s, token);
 325
 326     case SEG_UNQUOTED_STRING:
 327     case SEG_DO_REPEAT_COMMAND:
 328     case SEG_INLINE_DATA:
 329     case SEG_DOCUMENT:
 330     case SEG_MACRO_BODY:
 331     case SEG_MACRO_NAME:
 332       *token = (struct token) { .type = T_STRING };
 333       ss_alloc_substring (&token->string, s);
 334       return TOKENIZE_TOKEN;
 335
 336     case SEG_RESERVED_WORD:
 337       *token = (struct token) { .type = scan_reserved_word__ (s) };
 338       return TOKENIZE_TOKEN;
 339
 340     case SEG_IDENTIFIER:
 341       *token = (struct token) { .type = T_ID };
 342       ss_alloc_substring (&token->string, s);
 343       return TOKENIZE_TOKEN;
 344
 345     case SEG_MACRO_ID:
 346       *token = (struct token) { .type = T_MACRO_ID };
 347       ss_alloc_substring (&token->string, s);
 348       return TOKENIZE_TOKEN;
 349
 350     case SEG_PUNCT:
 351       *token = (struct token) { .type = scan_punct__ (s) };
 352       if (token->type == T_MACRO_PUNCT)
 353         ss_alloc_substring (&token->string, s);
 354       return TOKENIZE_TOKEN;
 355
 356     case SEG_SHBANG:
 357     case SEG_SPACES:
 358     case SEG_COMMENT:
 359     case SEG_NEWLINE:
 360     case SEG_COMMENT_COMMAND:
 361       return TOKENIZE_EMPTY;
 362
 363     case SEG_START_DOCUMENT:
 364       *token = (struct token) { .type = T_ID };
 365       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 366       return TOKENIZE_TOKEN;
 367
 368     case SEG_START_COMMAND:
 369     case SEG_SEPARATE_COMMANDS:
 370     case SEG_END_COMMAND:
 371       *token = (struct token) { .type = T_ENDCMD };
 372       return TOKENIZE_TOKEN;
 373
 374     case SEG_END:
 375       *token = (struct token) { .type = T_STOP };
 376       return TOKENIZE_TOKEN;
 377
 378     case SEG_EXPECTED_QUOTE:
 379       tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
 380       return TOKENIZE_ERROR;
 381
 382     case SEG_EXPECTED_EXPONENT:
 383       tokenize_error__ (token,
 384                         xasprintf (_("Missing exponent following `%.*s'."),
 385                                    (int) s.length, s.string));
 386       return TOKENIZE_ERROR;
 387
 388     case SEG_UNEXPECTED_CHAR:
 389       tokenize_unexpected_char (&s, token);
 390       return TOKENIZE_ERROR;
 391     }
 392
 393   NOT_REACHED ();
 394 }
 395
 396 \f
 397 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 398    specified MODE.
 399
 400    SLEX has no internal state to free, but it retains a reference to INPUT, so
 401    INPUT must not be modified or freed while SLEX is still in use. */
 402 void
 403 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 404                    enum segmenter_mode mode, bool is_snippet)
 405 {
 406   *slex = (struct string_lexer) {
 407     .input = input,
 408     .length = length,
 409     .offset = 0,
 410     .segmenter = segmenter_init (mode, is_snippet),
 411   };
 412 }
 413
 414 /*  */
 415 enum string_lexer_result
 416 string_lexer_next (struct string_lexer *slex, struct token *token)
 417 {
 418   for (;;)
 419     {
 420       const char *s = slex->input + slex->offset;
 421       size_t left = slex->length - slex->offset;
 422       enum segment_type type;
 423       int n;
 424
 425       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 426       assert (n >= 0);
 427
 428       slex->offset += n;
 429       switch (token_from_segment (type, ss_buffer (s, n), token))
 430         {
 431         case TOKENIZE_TOKEN:
 432           return token->type == T_STOP ? SLR_END : SLR_TOKEN;
 433
 434         case TOKENIZE_ERROR:
 435           return SLR_ERROR;
 436
 437         case TOKENIZE_EMPTY:
 438           break;
 439         }
 440     }
 441 }
 442
 443 static struct substring
 444 concat (struct substring a, struct substring b)
 445 {
 446   size_t length = a.length + b.length;
 447   struct substring out = { .string = xmalloc (length + 1), .length = length };
 448   memcpy (out.string, a.string, a.length);
 449   memcpy (out.string + a.length, b.string, b.length);
 450   out.string[length] = '\0';
 451   return out;
 452 }
 453
 454 /* Attempts to merge a sequence of tokens together into a single token.  The
 455    caller feeds tokens in one by one and the merger FSM reports progress.  The
 456    caller must supply a merger structure M that is set to MERGER_INIT before
 457    the first call.  The caller must also supply a token OUT for storage, which
 458    need not be initialized.
 459
 460    Returns:
 461
 462    * -1 if more tokens are needed.  Token OUT might be in use for temporary
 463       storage; to ensure that it is freed, continue calling merger_add() until
 464       it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
 465       that.)
 466
 467    * 0 if the first token submitted to the merger is the output.  This is the
 468      common case for the first call, and it can be returned for subsequent
 469      calls as well.
 470
 471    * A positive number if OUT is initialized to the output token.  The return
 472      value is the number of tokens being merged to produce this one. */
 473 int
 474 merger_add (struct merger *m, const struct token *in, struct token *out)
 475 {
 476   /* We perform two different kinds of token merging:
 477
 478      - String concatenation, where syntax like "a" + "b" is converted into a
 479        single string token.  This is definitely needed because the parser
 480        relies on it.
 481
 482      - Negative number merging, where syntax like -5 is converted from a pair
 483        of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM).  This
 484        might not be needed anymore because the segmenter directly treats a dash
 485        followed by a number, with optional intervening white space, as a
 486        negative number.  It's only needed if we want intervening comments to be
 487        allowed or for part of the negative number token to be produced by macro
 488        expansion. */
 489   switch (++m->state)
 490     {
 491     case 1:
 492       if (in->type == T_DASH || in->type == T_STRING)
 493         {
 494           *out = *in;
 495           return -1;
 496         }
 497       else
 498         return 0;
 499
 500     case 2:
 501       if (out->type == T_DASH)
 502         {
 503           if (in->type == T_POS_NUM)
 504             {
 505               *out = (struct token) {
 506                 .type = T_NEG_NUM,
 507                 .number = -in->number
 508               };
 509               return 2;
 510             }
 511           else
 512             return 0;
 513         }
 514       else
 515         return in->type == T_PLUS ? -1 : 0;
 516       NOT_REACHED ();
 517
 518     case 3:
 519       if (in->type == T_STRING)
 520         {
 521           out->string = concat (out->string, in->string);
 522           return -1;
 523         }
 524       else
 525         return 0;
 526       NOT_REACHED ();
 527
 528     default:
 529       if (!(m->state % 2))
 530         return in->type == T_PLUS ? -1 : m->state - 1;
 531       else
 532         {
 533           if (in->type == T_STRING)
 534             {
 535               struct substring s = concat (out->string, in->string);
 536               ss_swap (&s, &out->string);
 537               ss_dealloc (&s);
 538               return -1;
 539             }
 540           else
 541             return m->state - 2;
 542         }
 543       NOT_REACHED ();
 544     }
 545 }