pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28 #include "libpspp/i18n.h"
  29
  30 #include "gl/c-ctype.h"
  31 #include "gl/c-strtod.h"
  32 #include "gl/xmemdup0.h"
  33
  34 #include "gettext.h"
  35 #define _(msgid) gettext (msgid)
  36
  37 /* Returns the integer value of (hex) digit C. */
  38 static int
  39 digit_value (int c)
  40 {
  41   switch (c)
  42     {
  43     case '0': return 0;
  44     case '1': return 1;
  45     case '2': return 2;
  46     case '3': return 3;
  47     case '4': return 4;
  48     case '5': return 5;
  49     case '6': return 6;
  50     case '7': return 7;
  51     case '8': return 8;
  52     case '9': return 9;
  53     case 'a': case 'A': return 10;
  54     case 'b': case 'B': return 11;
  55     case 'c': case 'C': return 12;
  56     case 'd': case 'D': return 13;
  57     case 'e': case 'E': return 14;
  58     case 'f': case 'F': return 15;
  59     default: return INT_MAX;
  60     }
  61 }
  62
  63 static void
  64 scan_quoted_string (struct substring in, struct token *token)
  65 {
  66   /* Trim ' or " from front and back. */
  67   int quote = in.string[0];
  68   in.string++;
  69   in.length -= 2;
  70
  71   struct substring out = { .string = xmalloc (in.length + 1) };
  72
  73   for (;;)
  74     {
  75       size_t pos = ss_find_byte (in, quote);
  76       if (pos == SIZE_MAX)
  77         break;
  78
  79       memcpy (ss_end (out), in.string, pos + 1);
  80       out.length += pos + 1;
  81       ss_advance (&in, pos + 2);
  82     }
  83
  84   memcpy (ss_end (out), in.string, in.length);
  85   out.length += in.length;
  86   out.string[out.length] = '\0';
  87
  88   *token = (struct token) { .type = T_STRING, .string = out };
  89 }
  90
  91 static char *
  92 scan_hex_string__ (struct substring in, struct substring *out)
  93 {
  94   if (in.length % 2 != 0)
  95     return xasprintf (_("String of hex digits has %zu characters, which "
  96                         "is not a multiple of 2."), in.length);
  97
  98   ss_realloc (out, in.length / 2 + 1);
  99   uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
 100   out->length = in.length / 2;
 101   for (size_t i = 0; i < in.length; i += 2)
 102     {
 103       int hi = digit_value (in.string[i]);
 104       int lo = digit_value (in.string[i + 1]);
 105
 106       if (hi >= 16 || lo >= 16)
 107         return xasprintf (_("`%c' is not a valid hex digit."),
 108                           in.string[hi >= 16 ? i : i + 1]);
 109
 110       *dst++ = hi * 16 + lo;
 111     }
 112
 113   return NULL;
 114 }
 115
 116 static char *
 117 scan_unicode_string__ (struct substring in, struct substring *out)
 118 {
 119   if (in.length < 1 || in.length > 8)
 120     return xasprintf (_("Unicode string contains %zu bytes, which is "
 121                         "not in the valid range of 1 to 8 bytes."),
 122                       in.length);
 123
 124   ucs4_t uc = 0;
 125   for (size_t i = 0; i < in.length; i++)
 126     {
 127       int digit = digit_value (in.string[i]);
 128       if (digit >= 16)
 129         return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
 130       uc = uc * 16 + digit;
 131     }
 132
 133   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 134     return xasprintf (_("U+%04llX is not a valid Unicode code point."),
 135                       (long long) uc);
 136
 137   ss_realloc (out, 4 + 1);
 138   out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
 139
 140   return NULL;
 141 }
 142
 143 static enum token_type
 144 scan_reserved_word__ (struct substring word)
 145 {
 146   switch (c_toupper (word.string[0]))
 147     {
 148     case 'B':
 149       return T_BY;
 150
 151     case 'E':
 152       return T_EQ;
 153
 154     case 'G':
 155       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 156
 157     case 'L':
 158       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 159
 160     case 'N':
 161       return word.length == 2 ? T_NE : T_NOT;
 162
 163     case 'O':
 164       return T_OR;
 165
 166     case 'T':
 167       return T_TO;
 168
 169     case 'A':
 170       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 171
 172     case 'W':
 173       return T_WITH;
 174     }
 175
 176   NOT_REACHED ();
 177 }
 178
 179 static enum token_type
 180 scan_punct1__ (char c0)
 181 {
 182   switch (c0)
 183     {
 184     case '(': return T_LPAREN;
 185     case ')': return T_RPAREN;
 186     case ',': return T_COMMA;
 187     case '=': return T_EQUALS;
 188     case '-': return T_DASH;
 189     case '[': return T_LBRACK;
 190     case ']': return T_RBRACK;
 191     case '&': return T_AND;
 192     case '|': return T_OR;
 193     case '+': return T_PLUS;
 194     case '/': return T_SLASH;
 195     case '*': return T_ASTERISK;
 196     case '<': return T_LT;
 197     case '>': return T_GT;
 198     case '~': return T_NOT;
 199     default: return T_MACRO_PUNCT;
 200     }
 201
 202   NOT_REACHED ();
 203 }
 204
 205 static enum token_type
 206 scan_punct2__ (char c0, char c1)
 207 {
 208   switch (c0)
 209     {
 210     case '*':
 211       return T_EXP;
 212
 213     case '<':
 214       return c1 == '=' ? T_LE : T_NE;
 215
 216     case '>':
 217       return T_GE;
 218
 219     case '~':
 220       return T_NE;
 221
 222     case '&':
 223       return T_AND;
 224
 225     case '|':
 226       return T_OR;
 227     }
 228
 229   NOT_REACHED ();
 230 }
 231
 232 static enum token_type
 233 scan_punct__ (struct substring s)
 234 {
 235   return (s.length == 1
 236           ? scan_punct1__ (s.string[0])
 237           : scan_punct2__ (s.string[0], s.string[1]));
 238 }
 239
 240 static void
 241 scan_number__ (struct substring s, struct token *token)
 242 {
 243   char buf[128];
 244   char *p;
 245
 246   if (s.length < sizeof buf)
 247     {
 248       p = buf;
 249       memcpy (buf, s.string, s.length);
 250       buf[s.length] = '\0';
 251     }
 252   else
 253     p = xmemdup0 (s.string, s.length);
 254
 255   bool negative = *p == '-';
 256   double x = c_strtod (p + negative, NULL);
 257   *token = (struct token) {
 258     .type = negative ? T_NEG_NUM : T_POS_NUM,
 259     .number = negative ? -x : x,
 260   };
 261
 262   if (p != buf)
 263     free (p);
 264 }
 265 \f
 266 static void
 267 tokenize_error__ (struct token *token, char *error)
 268 {
 269   *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
 270 }
 271
 272 static enum tokenize_result
 273 tokenize_string_segment__ (enum segment_type type,
 274                            struct substring s, struct token *token)
 275 {
 276   /* Trim X' or U' from front and ' from back. */
 277   s.string += 2;
 278   s.length -= 3;
 279
 280   struct substring out = SS_EMPTY_INITIALIZER;
 281   char *error = (type == SEG_HEX_STRING
 282                  ? scan_hex_string__ (s, &out)
 283                  : scan_unicode_string__ (s, &out));
 284   if (!error)
 285     {
 286       out.string[out.length] = '\0';
 287       *token = (struct token) { .type = T_STRING, .string = out };
 288       return TOKENIZE_TOKEN;
 289     }
 290   else
 291     {
 292       tokenize_error__ (token, error);
 293       ss_dealloc (&out);
 294       return TOKENIZE_ERROR;
 295     }
 296 }
 297
 298 static void
 299 tokenize_unexpected_char (const struct substring *s, struct token *token)
 300 {
 301   ucs4_t uc;
 302   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 303
 304   char c_name[16];
 305   tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
 306                                       uc_name (uc, c_name)));
 307 }
 308
 309 enum tokenize_result
 310 token_from_segment (enum segment_type type, struct substring s,
 311                     struct token *token)
 312 {
 313   switch (type)
 314     {
 315     case SEG_NUMBER:
 316       scan_number__ (s, token);
 317       return TOKENIZE_TOKEN;
 318
 319     case SEG_QUOTED_STRING:
 320       scan_quoted_string (s, token);
 321       return TOKENIZE_TOKEN;
 322
 323     case SEG_HEX_STRING:
 324     case SEG_UNICODE_STRING:
 325       return tokenize_string_segment__ (type, s, token);
 326
 327     case SEG_UNQUOTED_STRING:
 328     case SEG_DO_REPEAT_COMMAND:
 329     case SEG_INLINE_DATA:
 330     case SEG_DOCUMENT:
 331     case SEG_MACRO_BODY:
 332     case SEG_MACRO_NAME:
 333       *token = (struct token) { .type = T_STRING };
 334       ss_alloc_substring (&token->string, s);
 335       return TOKENIZE_TOKEN;
 336
 337     case SEG_RESERVED_WORD:
 338       *token = (struct token) { .type = scan_reserved_word__ (s) };
 339       return TOKENIZE_TOKEN;
 340
 341     case SEG_IDENTIFIER:
 342       *token = (struct token) { .type = T_ID };
 343       ss_alloc_substring (&token->string, s);
 344       return TOKENIZE_TOKEN;
 345
 346     case SEG_MACRO_ID:
 347       *token = (struct token) { .type = T_MACRO_ID };
 348       ss_alloc_substring (&token->string, s);
 349       return TOKENIZE_TOKEN;
 350
 351     case SEG_PUNCT:
 352       *token = (struct token) { .type = scan_punct__ (s) };
 353       if (token->type == T_MACRO_PUNCT)
 354         ss_alloc_substring (&token->string, s);
 355       return TOKENIZE_TOKEN;
 356
 357     case SEG_SHBANG:
 358     case SEG_SPACES:
 359     case SEG_COMMENT:
 360     case SEG_NEWLINE:
 361     case SEG_COMMENT_COMMAND:
 362       return TOKENIZE_EMPTY;
 363
 364     case SEG_START_DOCUMENT:
 365       *token = (struct token) { .type = T_ID };
 366       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 367       return TOKENIZE_TOKEN;
 368
 369     case SEG_START_COMMAND:
 370     case SEG_SEPARATE_COMMANDS:
 371     case SEG_END_COMMAND:
 372       *token = (struct token) { .type = T_ENDCMD };
 373       return TOKENIZE_TOKEN;
 374
 375     case SEG_END:
 376       *token = (struct token) { .type = T_STOP };
 377       return TOKENIZE_TOKEN;
 378
 379     case SEG_EXPECTED_QUOTE:
 380       tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
 381       return TOKENIZE_ERROR;
 382
 383     case SEG_EXPECTED_EXPONENT:
 384       tokenize_error__ (token,
 385                         xasprintf (_("Missing exponent following `%.*s'."),
 386                                    (int) s.length, s.string));
 387       return TOKENIZE_ERROR;
 388
 389     case SEG_UNEXPECTED_CHAR:
 390       tokenize_unexpected_char (&s, token);
 391       return TOKENIZE_ERROR;
 392     }
 393
 394   NOT_REACHED ();
 395 }
 396
 397 \f
 398 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 399    specified MODE.
 400
 401    SLEX has no internal state to free, but it retains a reference to INPUT, so
 402    INPUT must not be modified or freed while SLEX is still in use. */
 403 void
 404 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 405                    enum segmenter_mode mode, bool is_snippet)
 406 {
 407   *slex = (struct string_lexer) {
 408     .input = input,
 409     .length = length,
 410     .offset = 0,
 411     .segmenter = segmenter_init (mode, is_snippet),
 412   };
 413 }
 414
 415 /*  */
 416 enum string_lexer_result
 417 string_lexer_next (struct string_lexer *slex, struct token *token)
 418 {
 419   for (;;)
 420     {
 421       const char *s = slex->input + slex->offset;
 422       size_t left = slex->length - slex->offset;
 423       enum segment_type type;
 424       int n;
 425
 426       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 427       assert (n >= 0);
 428
 429       slex->offset += n;
 430       switch (token_from_segment (type, ss_buffer (s, n), token))
 431         {
 432         case TOKENIZE_TOKEN:
 433           return token->type == T_STOP ? SLR_END : SLR_TOKEN;
 434
 435         case TOKENIZE_ERROR:
 436           return SLR_ERROR;
 437
 438         case TOKENIZE_EMPTY:
 439           break;
 440         }
 441     }
 442 }
 443
 444 static struct substring
 445 concat (struct substring a, struct substring b)
 446 {
 447   size_t length = a.length + b.length;
 448   struct substring out = { .string = xmalloc (length + 1), .length = length };
 449   memcpy (out.string, a.string, a.length);
 450   memcpy (out.string + a.length, b.string, b.length);
 451   out.string[length] = '\0';
 452   return out;
 453 }
 454
 455 /* Attempts to merge a sequence of tokens together into a single token.  The
 456    caller feeds tokens in one by one and the merger FSM reports progress.  The
 457    caller must supply a merger structure M that is set to MERGER_INIT before
 458    the first call.  The caller must also supply a token OUT for storage, which
 459    need not be initialized.
 460
 461    Returns:
 462
 463    * -1 if more tokens are needed.  Token OUT might be in use for temporary
 464       storage; to ensure that it is freed, continue calling merger_add() until
 465       it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
 466       that.)
 467
 468    * 0 if the first token submitted to the merger is the output.  This is the
 469      common case for the first call, and it can be returned for subsequent
 470      calls as well.
 471
 472    * A positive number if OUT is initialized to the output token.  The return
 473      value is the number of tokens being merged to produce this one. */
 474 int
 475 merger_add (struct merger *m, const struct token *in, struct token *out)
 476 {
 477   /* We perform two different kinds of token merging:
 478
 479      - String concatenation, where syntax like "a" + "b" is converted into a
 480        single string token.  This is definitely needed because the parser
 481        relies on it.
 482
 483      - Negative number merging, where syntax like -5 is converted from a pair
 484        of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM).  This
 485        might not be needed anymore because the segmenter directly treats a dash
 486        followed by a number, with optional intervening white space, as a
 487        negative number.  It's only needed if we want intervening comments to be
 488        allowed or for part of the negative number token to be produced by macro
 489        expansion. */
 490   switch (++m->state)
 491     {
 492     case 1:
 493       if (in->type == T_DASH || in->type == T_STRING)
 494         {
 495           *out = *in;
 496           return -1;
 497         }
 498       else
 499         return 0;
 500
 501     case 2:
 502       if (out->type == T_DASH)
 503         {
 504           if (in->type == T_POS_NUM)
 505             {
 506               *out = (struct token) {
 507                 .type = T_NEG_NUM,
 508                 .number = -in->number
 509               };
 510               return 2;
 511             }
 512           else
 513             return 0;
 514         }
 515       else
 516         return in->type == T_PLUS ? -1 : 0;
 517       NOT_REACHED ();
 518
 519     case 3:
 520       if (in->type == T_STRING)
 521         {
 522           out->string = concat (out->string, in->string);
 523           return -1;
 524         }
 525       else
 526         return 0;
 527       NOT_REACHED ();
 528
 529     default:
 530       if (!(m->state % 2))
 531         return in->type == T_PLUS ? -1 : m->state - 1;
 532       else
 533         {
 534           if (in->type == T_STRING)
 535             {
 536               struct substring s = concat (out->string, in->string);
 537               ss_swap (&s, &out->string);
 538               ss_dealloc (&s);
 539               return -1;
 540             }
 541           else
 542             return m->state - 2;
 543         }
 544       NOT_REACHED ();
 545     }
 546 }