pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28 #include "libpspp/i18n.h"
  29
  30 #include "gl/c-ctype.h"
  31 #include "gl/c-strtod.h"
  32 #include "gl/xmemdup0.h"
  33
  34 #include "gettext.h"
  35 #define _(msgid) gettext (msgid)
  36
  37 /* Returns the integer value of (hex) digit C. */
  38 static int
  39 digit_value (int c)
  40 {
  41   switch (c)
  42     {
  43     case '0': return 0;
  44     case '1': return 1;
  45     case '2': return 2;
  46     case '3': return 3;
  47     case '4': return 4;
  48     case '5': return 5;
  49     case '6': return 6;
  50     case '7': return 7;
  51     case '8': return 8;
  52     case '9': return 9;
  53     case 'a': case 'A': return 10;
  54     case 'b': case 'B': return 11;
  55     case 'c': case 'C': return 12;
  56     case 'd': case 'D': return 13;
  57     case 'e': case 'E': return 14;
  58     case 'f': case 'F': return 15;
  59     default: return INT_MAX;
  60     }
  61 }
  62
  63 static void
  64 scan_quoted_string (struct substring in, struct token *token)
  65 {
  66   /* Trim ' or " from front and back. */
  67   int quote = in.string[0];
  68   in.string++;
  69   in.length -= 2;
  70
  71   struct substring out = { .string = xmalloc (in.length + 1) };
  72
  73   for (;;)
  74     {
  75       size_t pos = ss_find_byte (in, quote);
  76       if (pos == SIZE_MAX)
  77         break;
  78
  79       memcpy (ss_end (out), in.string, pos + 1);
  80       out.length += pos + 1;
  81       ss_advance (&in, pos + 2);
  82     }
  83
  84   memcpy (ss_end (out), in.string, in.length);
  85   out.length += in.length;
  86   out.string[out.length] = '\0';
  87
  88   *token = (struct token) { .type = T_STRING, .string = out };
  89 }
  90
  91 static char *
  92 scan_hex_string__ (struct substring in, struct substring *out)
  93 {
  94   if (in.length % 2 != 0)
  95     return xasprintf (_("String of hex digits has %zu characters, which "
  96                         "is not a multiple of 2."), in.length);
  97
  98   ss_realloc (out, in.length / 2 + 1);
  99   uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
 100   out->length = in.length / 2;
 101   for (size_t i = 0; i < in.length; i += 2)
 102     {
 103       int hi = digit_value (in.string[i]);
 104       int lo = digit_value (in.string[i + 1]);
 105
 106       if (hi >= 16 || lo >= 16)
 107         return xasprintf (_("`%c' is not a valid hex digit."),
 108                           in.string[hi >= 16 ? i : i + 1]);
 109
 110       *dst++ = hi * 16 + lo;
 111     }
 112
 113   return NULL;
 114 }
 115
 116 static char *
 117 scan_unicode_string__ (struct substring in, struct substring *out)
 118 {
 119   if (in.length < 1 || in.length > 8)
 120     return xasprintf (_("Unicode string contains %zu bytes, which is "
 121                         "not in the valid range of 1 to 8 bytes."),
 122                       in.length);
 123
 124   ucs4_t uc = 0;
 125   for (size_t i = 0; i < in.length; i++)
 126     {
 127       int digit = digit_value (in.string[i]);
 128       if (digit >= 16)
 129         return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
 130       uc = uc * 16 + digit;
 131     }
 132
 133   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 134     return xasprintf (_("U+%04llX is not a valid Unicode code point."),
 135                       (long long) uc);
 136
 137   ss_realloc (out, 4 + 1);
 138   out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
 139
 140   return NULL;
 141 }
 142
 143 static enum token_type
 144 scan_reserved_word__ (struct substring word)
 145 {
 146   switch (c_toupper (word.string[0]))
 147     {
 148     case 'B':
 149       return T_BY;
 150
 151     case 'E':
 152       return T_EQ;
 153
 154     case 'G':
 155       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 156
 157     case 'L':
 158       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 159
 160     case 'N':
 161       return word.length == 2 ? T_NE : T_NOT;
 162
 163     case 'O':
 164       return T_OR;
 165
 166     case 'T':
 167       return T_TO;
 168
 169     case 'A':
 170       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 171
 172     case 'W':
 173       return T_WITH;
 174     }
 175
 176   NOT_REACHED ();
 177 }
 178
 179 static enum token_type
 180 scan_punct1__ (char c0)
 181 {
 182   switch (c0)
 183     {
 184     case '(': return T_LPAREN;
 185     case ')': return T_RPAREN;
 186     case ',': return T_COMMA;
 187     case '=': return T_EQUALS;
 188     case '-': return T_DASH;
 189     case '[': return T_LBRACK;
 190     case ']': return T_RBRACK;
 191     case '&': return T_AND;
 192     case '|': return T_OR;
 193     case '+': return T_PLUS;
 194     case '/': return T_SLASH;
 195     case '*': return T_ASTERISK;
 196     case '<': return T_LT;
 197     case '>': return T_GT;
 198     case '~': return T_NOT;
 199     default: return T_MACRO_PUNCT;
 200     }
 201
 202   NOT_REACHED ();
 203 }
 204
 205 static enum token_type
 206 scan_punct2__ (char c0, char c1)
 207 {
 208   switch (c0)
 209     {
 210     case '*':
 211       return T_EXP;
 212
 213     case '<':
 214       return c1 == '=' ? T_LE : T_NE;
 215
 216     case '>':
 217       return T_GE;
 218
 219     case '~':
 220       return T_NE;
 221
 222     case '&':
 223       return T_AND;
 224
 225     case '|':
 226       return T_OR;
 227     }
 228
 229   NOT_REACHED ();
 230 }
 231
 232 static enum token_type
 233 scan_punct__ (struct substring s)
 234 {
 235   return (s.length == 1
 236           ? scan_punct1__ (s.string[0])
 237           : scan_punct2__ (s.string[0], s.string[1]));
 238 }
 239
 240 static void
 241 scan_number__ (struct substring s, struct token *token)
 242 {
 243   char buf[128];
 244   char *p;
 245
 246   if (s.length < sizeof buf)
 247     {
 248       p = buf;
 249       memcpy (buf, s.string, s.length);
 250       buf[s.length] = '\0';
 251     }
 252   else
 253     p = xmemdup0 (s.string, s.length);
 254
 255   bool negative = *p == '-';
 256   double x = c_strtod (p + negative, NULL);
 257   *token = (struct token) {
 258     .type = negative ? T_NEG_NUM : T_POS_NUM,
 259     .number = negative ? -x : x,
 260   };
 261
 262   if (p != buf)
 263     free (p);
 264 }
 265 \f
 266 static void
 267 tokenize_error__ (struct token *token, char *error)
 268 {
 269   *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
 270 }
 271
 272 static enum tokenize_result
 273 tokenize_string_segment__ (enum segment_type type,
 274                            struct substring s, struct token *token)
 275 {
 276   /* Trim X' or U' from front and ' from back. */
 277   s.string += 2;
 278   s.length -= 3;
 279
 280   struct substring out = SS_EMPTY_INITIALIZER;
 281   char *error = (type == SEG_HEX_STRING
 282                  ? scan_hex_string__ (s, &out)
 283                  : scan_unicode_string__ (s, &out));
 284   if (!error)
 285     {
 286       out.string[out.length] = '\0';
 287       *token = (struct token) { .type = T_STRING, .string = out };
 288       return TOKENIZE_TOKEN;
 289     }
 290   else
 291     {
 292       tokenize_error__ (token, error);
 293       return TOKENIZE_ERROR;
 294     }
 295 }
 296
 297 static void
 298 tokenize_unexpected_char (const struct substring *s, struct token *token)
 299 {
 300   ucs4_t uc;
 301   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 302
 303   char c_name[16];
 304   tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
 305                                       uc_name (uc, c_name)));
 306 }
 307
 308 enum tokenize_result
 309 token_from_segment (enum segment_type type, struct substring s,
 310                     struct token *token)
 311 {
 312   switch (type)
 313     {
 314     case SEG_NUMBER:
 315       scan_number__ (s, token);
 316       return TOKENIZE_TOKEN;
 317
 318     case SEG_QUOTED_STRING:
 319       scan_quoted_string (s, token);
 320       return TOKENIZE_TOKEN;
 321
 322     case SEG_HEX_STRING:
 323     case SEG_UNICODE_STRING:
 324       return tokenize_string_segment__ (type, s, token);
 325
 326     case SEG_UNQUOTED_STRING:
 327     case SEG_DO_REPEAT_COMMAND:
 328     case SEG_INLINE_DATA:
 329     case SEG_DOCUMENT:
 330     case SEG_MACRO_BODY:
 331       *token = (struct token) { .type = T_STRING };
 332       ss_alloc_substring (&token->string, s);
 333       return TOKENIZE_TOKEN;
 334
 335     case SEG_RESERVED_WORD:
 336       *token = (struct token) { .type = scan_reserved_word__ (s) };
 337       return TOKENIZE_TOKEN;
 338
 339     case SEG_IDENTIFIER:
 340       *token = (struct token) { .type = T_ID };
 341       ss_alloc_substring (&token->string, s);
 342       return TOKENIZE_TOKEN;
 343
 344     case SEG_MACRO_ID:
 345       *token = (struct token) { .type = T_MACRO_ID };
 346       ss_alloc_substring (&token->string, s);
 347       return TOKENIZE_TOKEN;
 348
 349     case SEG_PUNCT:
 350       *token = (struct token) { .type = scan_punct__ (s) };
 351       if (token->type == T_MACRO_PUNCT)
 352         ss_alloc_substring (&token->string, s);
 353       return TOKENIZE_TOKEN;
 354
 355     case SEG_SHBANG:
 356     case SEG_SPACES:
 357     case SEG_COMMENT:
 358     case SEG_NEWLINE:
 359     case SEG_COMMENT_COMMAND:
 360       return TOKENIZE_EMPTY;
 361
 362     case SEG_START_DOCUMENT:
 363       *token = (struct token) { .type = T_ID };
 364       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 365       return TOKENIZE_TOKEN;
 366
 367     case SEG_START_COMMAND:
 368     case SEG_SEPARATE_COMMANDS:
 369     case SEG_END_COMMAND:
 370       *token = (struct token) { .type = T_ENDCMD };
 371       return TOKENIZE_TOKEN;
 372
 373     case SEG_END:
 374       *token = (struct token) { .type = T_STOP };
 375       return TOKENIZE_TOKEN;
 376
 377     case SEG_EXPECTED_QUOTE:
 378       tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
 379       return TOKENIZE_ERROR;
 380
 381     case SEG_EXPECTED_EXPONENT:
 382       tokenize_error__ (token,
 383                         xasprintf (_("Missing exponent following `%.*s'."),
 384                                    (int) s.length, s.string));
 385       return TOKENIZE_ERROR;
 386
 387     case SEG_UNEXPECTED_CHAR:
 388       tokenize_unexpected_char (&s, token);
 389       return TOKENIZE_ERROR;
 390     }
 391
 392   NOT_REACHED ();
 393 }
 394
 395 \f
 396 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 397    specified MODE.
 398
 399    SLEX has no internal state to free, but it retains a reference to INPUT, so
 400    INPUT must not be modified or freed while SLEX is still in use. */
 401 void
 402 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 403                    enum segmenter_mode mode, bool is_snippet)
 404 {
 405   *slex = (struct string_lexer) {
 406     .input = input,
 407     .length = length,
 408     .offset = 0,
 409     .segmenter = segmenter_init (mode, is_snippet),
 410   };
 411 }
 412
 413 /*  */
 414 enum string_lexer_result
 415 string_lexer_next (struct string_lexer *slex, struct token *token)
 416 {
 417   for (;;)
 418     {
 419       const char *s = slex->input + slex->offset;
 420       size_t left = slex->length - slex->offset;
 421       enum segment_type type;
 422       int n;
 423
 424       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 425       assert (n >= 0);
 426
 427       slex->offset += n;
 428       switch (token_from_segment (type, ss_buffer (s, n), token))
 429         {
 430         case TOKENIZE_TOKEN:
 431           return token->type == T_STOP ? SLR_END : SLR_TOKEN;
 432
 433         case TOKENIZE_ERROR:
 434           return SLR_ERROR;
 435
 436         case TOKENIZE_EMPTY:
 437           break;
 438         }
 439     }
 440 }
 441
 442 static struct substring
 443 concat (struct substring a, struct substring b)
 444 {
 445   size_t length = a.length + b.length;
 446   struct substring out = { .string = xmalloc (length + 1), .length = length };
 447   memcpy (out.string, a.string, a.length);
 448   memcpy (out.string + a.length, b.string, b.length);
 449   out.string[length] = '\0';
 450   return out;
 451 }
 452
 453 /* Attempts to merge a sequence of tokens together into a single token.  The
 454    caller feeds tokens in one by one and the merger FSM reports progress.  The
 455    caller must supply a merger structure M that is set to MERGER_INIT before
 456    the first call.  The caller must also supply a token OUT for storage, which
 457    need not be initialized.
 458
 459    Returns:
 460
 461    * -1 if more tokens are needed.  Token OUT might be in use for temporary
 462       storage; to ensure that it is freed, continue calling merger_add() until
 463       it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
 464       that.)
 465
 466    * 0 if the first token submitted to the merger is the output.  This is the
 467      common case for the first call, and it can be returned for subsequent
 468      calls as well.
 469
 470    * A positive number if OUT is initialized to the output token.  The return
 471      value is the number of tokens being merged to produce this one. */
 472 int
 473 merger_add (struct merger *m, const struct token *in, struct token *out)
 474 {
 475   /* We perform two different kinds of token merging:
 476
 477      - String concatenation, where syntax like "a" + "b" is converted into a
 478        single string token.  This is definitely needed because the parser
 479        relies on it.
 480
 481      - Negative number merging, where syntax like -5 is converted from a pair
 482        of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM).  This
 483        might not be needed anymore because the segmenter directly treats a dash
 484        followed by a number, with optional intervening white space, as a
 485        negative number.  It's only needed if we want intervening comments to be
 486        allowed or for part of the negative number token to be produced by macro
 487        expansion. */
 488   switch (++m->state)
 489     {
 490     case 1:
 491       if (in->type == T_DASH || in->type == T_STRING)
 492         {
 493           *out = *in;
 494           return -1;
 495         }
 496       else
 497         return 0;
 498
 499     case 2:
 500       if (out->type == T_DASH)
 501         {
 502           if (in->type == T_POS_NUM)
 503             {
 504               *out = (struct token) {
 505                 .type = T_NEG_NUM,
 506                 .number = -in->number
 507               };
 508               return 2;
 509             }
 510           else
 511             return 0;
 512         }
 513       else
 514         return in->type == T_PLUS ? -1 : 0;
 515       NOT_REACHED ();
 516
 517     case 3:
 518       if (in->type == T_STRING)
 519         {
 520           out->string = concat (out->string, in->string);
 521           return -1;
 522         }
 523       else
 524         return 0;
 525       NOT_REACHED ();
 526
 527     default:
 528       if (!(m->state % 2))
 529         return in->type == T_PLUS ? -1 : m->state - 1;
 530       else
 531         {
 532           if (in->type == T_STRING)
 533             {
 534               struct substring s = concat (out->string, in->string);
 535               ss_swap (&s, &out->string);
 536               ss_dealloc (&s);
 537               return -1;
 538             }
 539           else
 540             return m->state - 2;
 541         }
 542       NOT_REACHED ();
 543     }
 544 }