pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28 #include "libpspp/i18n.h"
  29
  30 #include "gl/c-ctype.h"
  31 #include "gl/c-strtod.h"
  32 #include "gl/xmemdup0.h"
  33
  34 #include "gettext.h"
  35 #define _(msgid) gettext (msgid)
  36
  37 /* Returns the integer value of (hex) digit C. */
  38 static int
  39 digit_value (int c)
  40 {
  41   switch (c)
  42     {
  43     case '0': return 0;
  44     case '1': return 1;
  45     case '2': return 2;
  46     case '3': return 3;
  47     case '4': return 4;
  48     case '5': return 5;
  49     case '6': return 6;
  50     case '7': return 7;
  51     case '8': return 8;
  52     case '9': return 9;
  53     case 'a': case 'A': return 10;
  54     case 'b': case 'B': return 11;
  55     case 'c': case 'C': return 12;
  56     case 'd': case 'D': return 13;
  57     case 'e': case 'E': return 14;
  58     case 'f': case 'F': return 15;
  59     default: return INT_MAX;
  60     }
  61 }
  62
  63 static void
  64 scan_quoted_string (struct substring in, struct token *token)
  65 {
  66   /* Trim ' or " from front and back. */
  67   int quote = in.string[0];
  68   in.string++;
  69   in.length -= 2;
  70
  71   struct substring out = { .string = xmalloc (in.length + 1) };
  72
  73   for (;;)
  74     {
  75       size_t pos = ss_find_byte (in, quote);
  76       if (pos == SIZE_MAX)
  77         break;
  78
  79       memcpy (ss_end (out), in.string, pos + 1);
  80       out.length += pos + 1;
  81       ss_advance (&in, pos + 2);
  82     }
  83
  84   memcpy (ss_end (out), in.string, in.length);
  85   out.length += in.length;
  86   out.string[out.length] = '\0';
  87
  88   *token = (struct token) { .type = T_STRING, .string = out };
  89 }
  90
  91 static char *
  92 scan_hex_string__ (struct substring in, struct substring *out)
  93 {
  94   if (in.length % 2 != 0)
  95     return xasprintf (_("String of hex digits has %zu characters, which "
  96                         "is not a multiple of 2."), in.length);
  97
  98   ss_realloc (out, in.length / 2 + 1);
  99   uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
 100   out->length = in.length / 2;
 101   for (size_t i = 0; i < in.length; i += 2)
 102     {
 103       int hi = digit_value (in.string[i]);
 104       int lo = digit_value (in.string[i + 1]);
 105
 106       if (hi >= 16 || lo >= 16)
 107         return xasprintf (_("`%c' is not a valid hex digit."),
 108                           in.string[hi >= 16 ? i : i + 1]);
 109
 110       *dst++ = hi * 16 + lo;
 111     }
 112
 113   return NULL;
 114 }
 115
 116 static char *
 117 scan_unicode_string__ (struct substring in, struct substring *out)
 118 {
 119   if (in.length < 1 || in.length > 8)
 120     return xasprintf (_("Unicode string contains %zu bytes, which is "
 121                         "not in the valid range of 1 to 8 bytes."),
 122                       in.length);
 123
 124   ucs4_t uc = 0;
 125   for (size_t i = 0; i < in.length; i++)
 126     {
 127       int digit = digit_value (in.string[i]);
 128       if (digit >= 16)
 129         return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
 130       uc = uc * 16 + digit;
 131     }
 132
 133   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 134     return xasprintf (_("U+%04llX is not a valid Unicode code point."),
 135                       (long long) uc);
 136
 137   ss_realloc (out, 4 + 1);
 138   out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
 139
 140   return NULL;
 141 }
 142
 143 static enum token_type
 144 scan_reserved_word__ (struct substring word)
 145 {
 146   switch (c_toupper (word.string[0]))
 147     {
 148     case 'B':
 149       return T_BY;
 150
 151     case 'E':
 152       return T_EQ;
 153
 154     case 'G':
 155       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 156
 157     case 'L':
 158       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 159
 160     case 'N':
 161       return word.length == 2 ? T_NE : T_NOT;
 162
 163     case 'O':
 164       return T_OR;
 165
 166     case 'T':
 167       return T_TO;
 168
 169     case 'A':
 170       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 171
 172     case 'W':
 173       return T_WITH;
 174     }
 175
 176   NOT_REACHED ();
 177 }
 178
 179 static enum token_type
 180 scan_punct1__ (char c0)
 181 {
 182   switch (c0)
 183     {
 184     case '(': return T_LPAREN;
 185     case ')': return T_RPAREN;
 186     case ',': return T_COMMA;
 187     case '=': return T_EQUALS;
 188     case '-': return T_DASH;
 189     case '[': return T_LBRACK;
 190     case ']': return T_RBRACK;
 191     case '{': return T_LCURLY;
 192     case '}': return T_RCURLY;
 193     case '&': return T_AND;
 194     case '|': return T_OR;
 195     case '+': return T_PLUS;
 196     case '/': return T_SLASH;
 197     case '*': return T_ASTERISK;
 198     case '<': return T_LT;
 199     case '>': return T_GT;
 200     case '~': return T_NOT;
 201     case ';': return T_SEMICOLON;
 202     case ':': return T_COLON;
 203     default: return T_MACRO_PUNCT;
 204     }
 205
 206   NOT_REACHED ();
 207 }
 208
 209 static enum token_type
 210 scan_punct2__ (char c0, char c1)
 211 {
 212   switch (c0)
 213     {
 214     case '*':
 215       return T_EXP;
 216
 217     case '<':
 218       return c1 == '=' ? T_LE : T_NE;
 219
 220     case '>':
 221       return T_GE;
 222
 223     case '~':
 224       return T_NE;
 225
 226     case '&':
 227       return T_AND;
 228
 229     case '|':
 230       return T_OR;
 231     }
 232
 233   NOT_REACHED ();
 234 }
 235
 236 static enum token_type
 237 scan_punct__ (struct substring s)
 238 {
 239   return (s.length == 1
 240           ? scan_punct1__ (s.string[0])
 241           : scan_punct2__ (s.string[0], s.string[1]));
 242 }
 243
 244 static void
 245 scan_number__ (struct substring s, struct token *token)
 246 {
 247   char buf[128];
 248   char *p;
 249
 250   if (s.length < sizeof buf)
 251     {
 252       p = buf;
 253       memcpy (buf, s.string, s.length);
 254       buf[s.length] = '\0';
 255     }
 256   else
 257     p = xmemdup0 (s.string, s.length);
 258
 259   bool negative = *p == '-';
 260   double x = c_strtod (p + negative, NULL);
 261   *token = (struct token) {
 262     .type = negative ? T_NEG_NUM : T_POS_NUM,
 263     .number = negative ? -x : x,
 264   };
 265
 266   if (p != buf)
 267     free (p);
 268 }
 269 \f
 270 static void
 271 tokenize_error__ (struct token *token, char *error)
 272 {
 273   *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
 274 }
 275
 276 static enum tokenize_result
 277 tokenize_string_segment__ (enum segment_type type,
 278                            struct substring s, struct token *token)
 279 {
 280   /* Trim X' or U' from front and ' from back. */
 281   s.string += 2;
 282   s.length -= 3;
 283
 284   struct substring out = SS_EMPTY_INITIALIZER;
 285   char *error = (type == SEG_HEX_STRING
 286                  ? scan_hex_string__ (s, &out)
 287                  : scan_unicode_string__ (s, &out));
 288   if (!error)
 289     {
 290       out.string[out.length] = '\0';
 291       *token = (struct token) { .type = T_STRING, .string = out };
 292       return TOKENIZE_TOKEN;
 293     }
 294   else
 295     {
 296       tokenize_error__ (token, error);
 297       return TOKENIZE_ERROR;
 298     }
 299 }
 300
 301 static void
 302 tokenize_unexpected_char (const struct substring *s, struct token *token)
 303 {
 304   ucs4_t uc;
 305   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 306
 307   char c_name[16];
 308   tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
 309                                       uc_name (uc, c_name)));
 310 }
 311
 312 enum tokenize_result
 313 token_from_segment (enum segment_type type, struct substring s,
 314                     struct token *token)
 315 {
 316   switch (type)
 317     {
 318     case SEG_NUMBER:
 319       scan_number__ (s, token);
 320       return TOKENIZE_TOKEN;
 321
 322     case SEG_QUOTED_STRING:
 323       scan_quoted_string (s, token);
 324       return TOKENIZE_TOKEN;
 325
 326     case SEG_HEX_STRING:
 327     case SEG_UNICODE_STRING:
 328       return tokenize_string_segment__ (type, s, token);
 329
 330     case SEG_UNQUOTED_STRING:
 331     case SEG_DO_REPEAT_COMMAND:
 332     case SEG_INLINE_DATA:
 333     case SEG_DOCUMENT:
 334     case SEG_MACRO_BODY:
 335     case SEG_MACRO_NAME:
 336       *token = (struct token) { .type = T_STRING };
 337       ss_alloc_substring (&token->string, s);
 338       return TOKENIZE_TOKEN;
 339
 340     case SEG_RESERVED_WORD:
 341       *token = (struct token) { .type = scan_reserved_word__ (s) };
 342       return TOKENIZE_TOKEN;
 343
 344     case SEG_IDENTIFIER:
 345       *token = (struct token) { .type = T_ID };
 346       ss_alloc_substring (&token->string, s);
 347       return TOKENIZE_TOKEN;
 348
 349     case SEG_MACRO_ID:
 350       *token = (struct token) { .type = T_MACRO_ID };
 351       ss_alloc_substring (&token->string, s);
 352       return TOKENIZE_TOKEN;
 353
 354     case SEG_PUNCT:
 355       *token = (struct token) { .type = scan_punct__ (s) };
 356       if (token->type == T_MACRO_PUNCT)
 357         ss_alloc_substring (&token->string, s);
 358       return TOKENIZE_TOKEN;
 359
 360     case SEG_SHBANG:
 361     case SEG_SPACES:
 362     case SEG_COMMENT:
 363     case SEG_NEWLINE:
 364     case SEG_COMMENT_COMMAND:
 365       return TOKENIZE_EMPTY;
 366
 367     case SEG_START_DOCUMENT:
 368       *token = (struct token) { .type = T_ID };
 369       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 370       return TOKENIZE_TOKEN;
 371
 372     case SEG_START_COMMAND:
 373     case SEG_SEPARATE_COMMANDS:
 374     case SEG_END_COMMAND:
 375       *token = (struct token) { .type = T_ENDCMD };
 376       return TOKENIZE_TOKEN;
 377
 378     case SEG_END:
 379       *token = (struct token) { .type = T_STOP };
 380       return TOKENIZE_TOKEN;
 381
 382     case SEG_EXPECTED_QUOTE:
 383       tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
 384       return TOKENIZE_ERROR;
 385
 386     case SEG_EXPECTED_EXPONENT:
 387       tokenize_error__ (token,
 388                         xasprintf (_("Missing exponent following `%.*s'."),
 389                                    (int) s.length, s.string));
 390       return TOKENIZE_ERROR;
 391
 392     case SEG_UNEXPECTED_CHAR:
 393       tokenize_unexpected_char (&s, token);
 394       return TOKENIZE_ERROR;
 395     }
 396
 397   NOT_REACHED ();
 398 }
 399
 400 \f
 401 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 402    specified MODE.
 403
 404    SLEX has no internal state to free, but it retains a reference to INPUT, so
 405    INPUT must not be modified or freed while SLEX is still in use. */
 406 void
 407 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 408                    enum segmenter_mode mode, bool is_snippet)
 409 {
 410   *slex = (struct string_lexer) {
 411     .input = input,
 412     .length = length,
 413     .offset = 0,
 414     .segmenter = segmenter_init (mode, is_snippet),
 415   };
 416 }
 417
 418 /*  */
 419 enum string_lexer_result
 420 string_lexer_next (struct string_lexer *slex, struct token *token)
 421 {
 422   for (;;)
 423     {
 424       const char *s = slex->input + slex->offset;
 425       size_t left = slex->length - slex->offset;
 426       enum segment_type type;
 427       int n;
 428
 429       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 430       assert (n >= 0);
 431
 432       slex->offset += n;
 433       switch (token_from_segment (type, ss_buffer (s, n), token))
 434         {
 435         case TOKENIZE_TOKEN:
 436           return token->type == T_STOP ? SLR_END : SLR_TOKEN;
 437
 438         case TOKENIZE_ERROR:
 439           return SLR_ERROR;
 440
 441         case TOKENIZE_EMPTY:
 442           break;
 443         }
 444     }
 445 }
 446
 447 static struct substring
 448 concat (struct substring a, struct substring b)
 449 {
 450   size_t length = a.length + b.length;
 451   struct substring out = { .string = xmalloc (length + 1), .length = length };
 452   memcpy (out.string, a.string, a.length);
 453   memcpy (out.string + a.length, b.string, b.length);
 454   out.string[length] = '\0';
 455   return out;
 456 }
 457
 458 /* Attempts to merge a sequence of tokens together into a single token.  The
 459    caller feeds tokens in one by one and the merger FSM reports progress.  The
 460    caller must supply a merger structure M that is set to MERGER_INIT before
 461    the first call.  The caller must also supply a token OUT for storage, which
 462    need not be initialized.
 463
 464    Returns:
 465
 466    * -1 if more tokens are needed.  Token OUT might be in use for temporary
 467       storage; to ensure that it is freed, continue calling merger_add() until
 468       it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
 469       that.)
 470
 471    * 0 if the first token submitted to the merger is the output.  This is the
 472      common case for the first call, and it can be returned for subsequent
 473      calls as well.
 474
 475    * A positive number if OUT is initialized to the output token.  The return
 476      value is the number of tokens being merged to produce this one. */
 477 int
 478 merger_add (struct merger *m, const struct token *in, struct token *out)
 479 {
 480   /* We perform two different kinds of token merging:
 481
 482      - String concatenation, where syntax like "a" + "b" is converted into a
 483        single string token.  This is definitely needed because the parser
 484        relies on it.
 485
 486      - Negative number merging, where syntax like -5 is converted from a pair
 487        of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM).  This
 488        might not be needed anymore because the segmenter directly treats a dash
 489        followed by a number, with optional intervening white space, as a
 490        negative number.  It's only needed if we want intervening comments to be
 491        allowed or for part of the negative number token to be produced by macro
 492        expansion. */
 493   switch (++m->state)
 494     {
 495     case 1:
 496       if (in->type == T_DASH || in->type == T_STRING)
 497         {
 498           *out = *in;
 499           return -1;
 500         }
 501       else
 502         return 0;
 503
 504     case 2:
 505       if (out->type == T_DASH)
 506         {
 507           if (in->type == T_POS_NUM)
 508             {
 509               *out = (struct token) {
 510                 .type = T_NEG_NUM,
 511                 .number = -in->number
 512               };
 513               return 2;
 514             }
 515           else
 516             return 0;
 517         }
 518       else
 519         return in->type == T_PLUS ? -1 : 0;
 520       NOT_REACHED ();
 521
 522     case 3:
 523       if (in->type == T_STRING)
 524         {
 525           out->string = concat (out->string, in->string);
 526           return -1;
 527         }
 528       else
 529         return 0;
 530       NOT_REACHED ();
 531
 532     default:
 533       if (!(m->state % 2))
 534         return in->type == T_PLUS ? -1 : m->state - 1;
 535       else
 536         {
 537           if (in->type == T_STRING)
 538             {
 539               struct substring s = concat (out->string, in->string);
 540               ss_swap (&s, &out->string);
 541               ss_dealloc (&s);
 542               return -1;
 543             }
 544           else
 545             return m->state - 2;
 546         }
 547       NOT_REACHED ();
 548     }
 549 }