pintos-os.org Git - pspp/blob - src/language/lexer/scan.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/scan.h"
  20
  21 #include <limits.h>
  22 #include <unistr.h>
  23
  24 #include "data/identifier.h"
  25 #include "language/lexer/token.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/cast.h"
  28 #include "libpspp/i18n.h"
  29
  30 #include "gl/c-ctype.h"
  31 #include "gl/c-strtod.h"
  32 #include "gl/xmemdup0.h"
  33
  34 #include "gettext.h"
  35 #define _(msgid) gettext (msgid)
  36
  37 /* Returns the integer value of (hex) digit C. */
  38 static int
  39 digit_value (int c)
  40 {
  41   switch (c)
  42     {
  43     case '0': return 0;
  44     case '1': return 1;
  45     case '2': return 2;
  46     case '3': return 3;
  47     case '4': return 4;
  48     case '5': return 5;
  49     case '6': return 6;
  50     case '7': return 7;
  51     case '8': return 8;
  52     case '9': return 9;
  53     case 'a': case 'A': return 10;
  54     case 'b': case 'B': return 11;
  55     case 'c': case 'C': return 12;
  56     case 'd': case 'D': return 13;
  57     case 'e': case 'E': return 14;
  58     case 'f': case 'F': return 15;
  59     default: return INT_MAX;
  60     }
  61 }
  62
  63 static void
  64 scan_quoted_string (struct substring in, struct token *token)
  65 {
  66   /* Trim ' or " from front and back. */
  67   int quote = in.string[0];
  68   in.string++;
  69   in.length -= 2;
  70
  71   struct substring out = { .string = xmalloc (in.length + 1) };
  72
  73   for (;;)
  74     {
  75       size_t pos = ss_find_byte (in, quote);
  76       if (pos == SIZE_MAX)
  77         break;
  78
  79       memcpy (ss_end (out), in.string, pos + 1);
  80       out.length += pos + 1;
  81       ss_advance (&in, pos + 2);
  82     }
  83
  84   memcpy (ss_end (out), in.string, in.length);
  85   out.length += in.length;
  86   out.string[out.length] = '\0';
  87
  88   *token = (struct token) { .type = T_STRING, .string = out };
  89 }
  90
  91 static char *
  92 scan_hex_string__ (struct substring in, struct substring *out)
  93 {
  94   if (in.length % 2 != 0)
  95     return xasprintf (_("String of hex digits has %zu characters, which "
  96                         "is not a multiple of 2."), in.length);
  97
  98   ss_realloc (out, in.length / 2 + 1);
  99   uint8_t *dst = CHAR_CAST (uint8_t *, out->string);
 100   out->length = in.length / 2;
 101   for (size_t i = 0; i < in.length; i += 2)
 102     {
 103       int hi = digit_value (in.string[i]);
 104       int lo = digit_value (in.string[i + 1]);
 105
 106       if (hi >= 16 || lo >= 16)
 107         return xasprintf (_("`%c' is not a valid hex digit."),
 108                           in.string[hi >= 16 ? i : i + 1]);
 109
 110       *dst++ = hi * 16 + lo;
 111     }
 112
 113   return NULL;
 114 }
 115
 116 static char *
 117 scan_unicode_string__ (struct substring in, struct substring *out)
 118 {
 119   if (in.length < 1 || in.length > 8)
 120     return xasprintf (_("Unicode string contains %zu bytes, which is "
 121                         "not in the valid range of 1 to 8 bytes."),
 122                       in.length);
 123
 124   ucs4_t uc = 0;
 125   for (size_t i = 0; i < in.length; i++)
 126     {
 127       int digit = digit_value (in.string[i]);
 128       if (digit >= 16)
 129         return xasprintf (_("`%c' is not a valid hex digit."), in.string[i]);
 130       uc = uc * 16 + digit;
 131     }
 132
 133   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
 134     return xasprintf (_("U+%04llX is not a valid Unicode code point."),
 135                       (long long) uc);
 136
 137   ss_realloc (out, 4 + 1);
 138   out->length = u8_uctomb (CHAR_CAST (uint8_t *, ss_end (*out)), uc, 4);
 139
 140   return NULL;
 141 }
 142
 143 static enum token_type
 144 scan_reserved_word__ (struct substring word)
 145 {
 146   switch (c_toupper (word.string[0]))
 147     {
 148     case 'B':
 149       return T_BY;
 150
 151     case 'E':
 152       return T_EQ;
 153
 154     case 'G':
 155       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
 156
 157     case 'L':
 158       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
 159
 160     case 'N':
 161       return word.length == 2 ? T_NE : T_NOT;
 162
 163     case 'O':
 164       return T_OR;
 165
 166     case 'T':
 167       return T_TO;
 168
 169     case 'A':
 170       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
 171
 172     case 'W':
 173       return T_WITH;
 174     }
 175
 176   NOT_REACHED ();
 177 }
 178
 179 static enum token_type
 180 scan_punct1__ (char c0)
 181 {
 182   switch (c0)
 183     {
 184     case '(': return T_LPAREN;
 185     case ')': return T_RPAREN;
 186     case ',': return T_COMMA;
 187     case '=': return T_EQUALS;
 188     case '-': return T_DASH;
 189     case '[': return T_LBRACK;
 190     case ']': return T_RBRACK;
 191     case '{': return T_LCURLY;
 192     case '}': return T_RCURLY;
 193     case '&': return T_AND;
 194     case '|': return T_OR;
 195     case '+': return T_PLUS;
 196     case '/': return T_SLASH;
 197     case '*': return T_ASTERISK;
 198     case '<': return T_LT;
 199     case '>': return T_GT;
 200     case '~': return T_NOT;
 201     case ';': return T_SEMICOLON;
 202     case ':': return T_COLON;
 203     default: return T_MACRO_PUNCT;
 204     }
 205
 206   NOT_REACHED ();
 207 }
 208
 209 static enum token_type
 210 scan_punct2__ (char c0, char c1)
 211 {
 212   switch (c0)
 213     {
 214     case '*':
 215       return T_EXP;
 216
 217     case '<':
 218       return c1 == '=' ? T_LE : T_NE;
 219
 220     case '>':
 221       return T_GE;
 222
 223     case '~':
 224       return T_NE;
 225
 226     case '&':
 227       return T_AND;
 228
 229     case '|':
 230       return T_OR;
 231     }
 232
 233   NOT_REACHED ();
 234 }
 235
 236 static enum token_type
 237 scan_punct__ (struct substring s)
 238 {
 239   return (s.length == 1
 240           ? scan_punct1__ (s.string[0])
 241           : scan_punct2__ (s.string[0], s.string[1]));
 242 }
 243
 244 static void
 245 scan_number__ (struct substring s, struct token *token)
 246 {
 247   char buf[128];
 248   char *p;
 249
 250   if (s.length < sizeof buf)
 251     {
 252       p = buf;
 253       memcpy (buf, s.string, s.length);
 254       buf[s.length] = '\0';
 255     }
 256   else
 257     p = xmemdup0 (s.string, s.length);
 258
 259   bool negative = *p == '-';
 260   double x = c_strtod (p + negative, NULL);
 261   *token = (struct token) {
 262     .type = negative ? T_NEG_NUM : T_POS_NUM,
 263     .number = negative ? -x : x,
 264   };
 265
 266   if (p != buf)
 267     free (p);
 268 }
 269 \f
 270 static void
 271 tokenize_error__ (struct token *token, char *error)
 272 {
 273   *token = (struct token) { .type = T_STRING, .string = ss_cstr (error) };
 274 }
 275
 276 static enum tokenize_result
 277 tokenize_string_segment__ (enum segment_type type,
 278                            struct substring s, struct token *token)
 279 {
 280   /* Trim X' or U' from front and ' from back. */
 281   s.string += 2;
 282   s.length -= 3;
 283
 284   struct substring out = SS_EMPTY_INITIALIZER;
 285   char *error = (type == SEG_HEX_STRING
 286                  ? scan_hex_string__ (s, &out)
 287                  : scan_unicode_string__ (s, &out));
 288   if (!error)
 289     {
 290       out.string[out.length] = '\0';
 291       *token = (struct token) { .type = T_STRING, .string = out };
 292       return TOKENIZE_TOKEN;
 293     }
 294   else
 295     {
 296       tokenize_error__ (token, error);
 297       ss_dealloc (&out);
 298       return TOKENIZE_ERROR;
 299     }
 300 }
 301
 302 static void
 303 tokenize_unexpected_char (const struct substring *s, struct token *token)
 304 {
 305   ucs4_t uc;
 306   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
 307
 308   char c_name[16];
 309   tokenize_error__ (token, xasprintf (_("Bad character %s in input."),
 310                                       uc_name (uc, c_name)));
 311 }
 312
 313 enum tokenize_result
 314 token_from_segment (enum segment_type type, struct substring s,
 315                     struct token *token)
 316 {
 317   switch (type)
 318     {
 319     case SEG_NUMBER:
 320       scan_number__ (s, token);
 321       return TOKENIZE_TOKEN;
 322
 323     case SEG_QUOTED_STRING:
 324       scan_quoted_string (s, token);
 325       return TOKENIZE_TOKEN;
 326
 327     case SEG_HEX_STRING:
 328     case SEG_UNICODE_STRING:
 329       return tokenize_string_segment__ (type, s, token);
 330
 331     case SEG_UNQUOTED_STRING:
 332     case SEG_DO_REPEAT_COMMAND:
 333     case SEG_INLINE_DATA:
 334     case SEG_DOCUMENT:
 335     case SEG_MACRO_BODY:
 336     case SEG_MACRO_NAME:
 337       *token = (struct token) { .type = T_STRING };
 338       ss_alloc_substring (&token->string, s);
 339       return TOKENIZE_TOKEN;
 340
 341     case SEG_RESERVED_WORD:
 342       *token = (struct token) { .type = scan_reserved_word__ (s) };
 343       return TOKENIZE_TOKEN;
 344
 345     case SEG_IDENTIFIER:
 346       *token = (struct token) { .type = T_ID };
 347       ss_alloc_substring (&token->string, s);
 348       return TOKENIZE_TOKEN;
 349
 350     case SEG_MACRO_ID:
 351       *token = (struct token) { .type = T_MACRO_ID };
 352       ss_alloc_substring (&token->string, s);
 353       return TOKENIZE_TOKEN;
 354
 355     case SEG_PUNCT:
 356       *token = (struct token) { .type = scan_punct__ (s) };
 357       if (token->type == T_MACRO_PUNCT)
 358         ss_alloc_substring (&token->string, s);
 359       return TOKENIZE_TOKEN;
 360
 361     case SEG_SHBANG:
 362     case SEG_SPACES:
 363     case SEG_COMMENT:
 364     case SEG_NEWLINE:
 365     case SEG_COMMENT_COMMAND:
 366       return TOKENIZE_EMPTY;
 367
 368     case SEG_START_DOCUMENT:
 369       *token = (struct token) { .type = T_ID };
 370       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
 371       return TOKENIZE_TOKEN;
 372
 373     case SEG_START_COMMAND:
 374     case SEG_SEPARATE_COMMANDS:
 375     case SEG_END_COMMAND:
 376       *token = (struct token) { .type = T_ENDCMD };
 377       return TOKENIZE_TOKEN;
 378
 379     case SEG_END:
 380       *token = (struct token) { .type = T_STOP };
 381       return TOKENIZE_TOKEN;
 382
 383     case SEG_EXPECTED_QUOTE:
 384       tokenize_error__ (token, xasprintf (_("Unterminated string constant.")));
 385       return TOKENIZE_ERROR;
 386
 387     case SEG_EXPECTED_EXPONENT:
 388       tokenize_error__ (token,
 389                         xasprintf (_("Missing exponent following `%.*s'."),
 390                                    (int) s.length, s.string));
 391       return TOKENIZE_ERROR;
 392
 393     case SEG_UNEXPECTED_CHAR:
 394       tokenize_unexpected_char (&s, token);
 395       return TOKENIZE_ERROR;
 396     }
 397
 398   NOT_REACHED ();
 399 }
 400
 401 \f
 402 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
 403    specified MODE.
 404
 405    SLEX has no internal state to free, but it retains a reference to INPUT, so
 406    INPUT must not be modified or freed while SLEX is still in use. */
 407 void
 408 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
 409                    enum segmenter_mode mode, bool is_snippet)
 410 {
 411   *slex = (struct string_lexer) {
 412     .input = input,
 413     .length = length,
 414     .offset = 0,
 415     .segmenter = segmenter_init (mode, is_snippet),
 416   };
 417 }
 418
 419 /*  */
 420 enum string_lexer_result
 421 string_lexer_next (struct string_lexer *slex, struct token *token)
 422 {
 423   for (;;)
 424     {
 425       const char *s = slex->input + slex->offset;
 426       size_t left = slex->length - slex->offset;
 427       enum segment_type type;
 428       int n;
 429
 430       n = segmenter_push (&slex->segmenter, s, left, true, &type);
 431       assert (n >= 0);
 432
 433       slex->offset += n;
 434       switch (token_from_segment (type, ss_buffer (s, n), token))
 435         {
 436         case TOKENIZE_TOKEN:
 437           return token->type == T_STOP ? SLR_END : SLR_TOKEN;
 438
 439         case TOKENIZE_ERROR:
 440           return SLR_ERROR;
 441
 442         case TOKENIZE_EMPTY:
 443           break;
 444         }
 445     }
 446 }
 447
 448 static struct substring
 449 concat (struct substring a, struct substring b)
 450 {
 451   size_t length = a.length + b.length;
 452   struct substring out = { .string = xmalloc (length + 1), .length = length };
 453   memcpy (out.string, a.string, a.length);
 454   memcpy (out.string + a.length, b.string, b.length);
 455   out.string[length] = '\0';
 456   return out;
 457 }
 458
 459 /* Attempts to merge a sequence of tokens together into a single token.  The
 460    caller feeds tokens in one by one and the merger FSM reports progress.  The
 461    caller must supply a merger structure M that is set to MERGER_INIT before
 462    the first call.  The caller must also supply a token OUT for storage, which
 463    need not be initialized.
 464
 465    Returns:
 466
 467    * -1 if more tokens are needed.  Token OUT might be in use for temporary
 468       storage; to ensure that it is freed, continue calling merger_add() until
 469       it returns something other than -1.  (T_STOP or T_ENDCMD will make it do
 470       that.)
 471
 472    * 0 if the first token submitted to the merger is the output.  This is the
 473      common case for the first call, and it can be returned for subsequent
 474      calls as well.
 475
 476    * A positive number if OUT is initialized to the output token.  The return
 477      value is the number of tokens being merged to produce this one. */
 478 int
 479 merger_add (struct merger *m, const struct token *in, struct token *out)
 480 {
 481   /* We perform two different kinds of token merging:
 482
 483      - String concatenation, where syntax like "a" + "b" is converted into a
 484        single string token.  This is definitely needed because the parser
 485        relies on it.
 486
 487      - Negative number merging, where syntax like -5 is converted from a pair
 488        of tokens (T_DASH then T_POS_NUM) into a single token (T_NEG_NUM).  This
 489        might not be needed anymore because the segmenter directly treats a dash
 490        followed by a number, with optional intervening white space, as a
 491        negative number.  It's only needed if we want intervening comments to be
 492        allowed or for part of the negative number token to be produced by macro
 493        expansion. */
 494   switch (++m->state)
 495     {
 496     case 1:
 497       if (in->type == T_DASH || in->type == T_STRING)
 498         {
 499           *out = *in;
 500           return -1;
 501         }
 502       else
 503         return 0;
 504
 505     case 2:
 506       if (out->type == T_DASH)
 507         {
 508           if (in->type == T_POS_NUM)
 509             {
 510               *out = (struct token) {
 511                 .type = T_NEG_NUM,
 512                 .number = -in->number
 513               };
 514               return 2;
 515             }
 516           else
 517             return 0;
 518         }
 519       else
 520         return in->type == T_PLUS ? -1 : 0;
 521       NOT_REACHED ();
 522
 523     case 3:
 524       if (in->type == T_STRING)
 525         {
 526           out->string = concat (out->string, in->string);
 527           return -1;
 528         }
 529       else
 530         return 0;
 531       NOT_REACHED ();
 532
 533     default:
 534       if (!(m->state % 2))
 535         return in->type == T_PLUS ? -1 : m->state - 1;
 536       else
 537         {
 538           if (in->type == T_STRING)
 539             {
 540               struct substring s = concat (out->string, in->string);
 541               ss_swap (&s, &out->string);
 542               ss_dealloc (&s);
 543               return -1;
 544             }
 545           else
 546             return m->state - 2;
 547         }
 548       NOT_REACHED ();
 549     }
 550 }