pintos-os.org Git - pspp/blob - src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18    This file is concerned with the definition of the PSPP syntax, NOT the
  19    action of scanning/parsing code.
  20 */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <string.h>
  27 #include <unistr.h>
  28 #include <unictype.h>
  29
  30 #include "libpspp/assertion.h"
  31 #include "libpspp/cast.h"
  32
  33 #include "gl/c-ctype.h"
  34
  35 #include "gettext.h"
  36 #define _(msgid) gettext (msgid)
  37
  38 /* Tokens. */
  39
  40 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
  41 const char *
  42 token_type_to_name (enum token_type type)
  43 {
  44   switch (type)
  45     {
  46 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  47       TOKEN_TYPES
  48 #undef TOKEN_TYPE
  49     case TOKEN_N_TYPES:
  50     default:
  51       return "unknown token type";
  52     }
  53 }
  54
  55 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
  56    as a statically allocated constant string.  This function returns NULL for
  57    tokens that don't have any fixed string representation, such as identifier
  58    and number tokens. */
  59 const char *
  60 token_type_to_string (enum token_type token)
  61 {
  62   switch (token)
  63     {
  64     case T_ID:
  65     case T_POS_NUM:
  66     case T_NEG_NUM:
  67     case T_STRING:
  68     case T_MACRO_ID:
  69     case T_MACRO_PUNCT:
  70     case T_STOP:
  71       return NULL;
  72
  73     case T_ENDCMD:
  74       return ".";
  75
  76     case T_PLUS:
  77       return "+";
  78
  79     case T_DASH:
  80       return "-";
  81
  82     case T_ASTERISK:
  83       return "*";
  84
  85     case T_SLASH:
  86       return "/";
  87
  88     case T_EQUALS:
  89       return "=";
  90
  91     case T_LPAREN:
  92       return "(";
  93
  94     case T_RPAREN:
  95       return ")";
  96
  97     case T_LBRACK:
  98       return "[";
  99
 100     case T_RBRACK:
 101       return "]";
 102
 103     case T_COMMA:
 104       return ",";
 105
 106     case T_AND:
 107       return "AND";
 108
 109     case T_OR:
 110       return "OR";
 111
 112     case T_NOT:
 113       return "NOT";
 114
 115     case T_EQ:
 116       return "EQ";
 117
 118     case T_GE:
 119       return ">=";
 120
 121     case T_GT:
 122       return ">";
 123
 124     case T_LE:
 125       return "<=";
 126
 127     case T_LT:
 128       return "<";
 129
 130     case T_NE:
 131       return "~=";
 132
 133     case T_ALL:
 134       return "ALL";
 135
 136     case T_BY:
 137       return "BY";
 138
 139     case T_TO:
 140       return "TO";
 141
 142     case T_WITH:
 143       return "WITH";
 144
 145     case T_EXP:
 146       return "**";
 147
 148     case TOKEN_N_TYPES:
 149       NOT_REACHED ();
 150     }
 151
 152   NOT_REACHED ();
 153 }
 154
 155 /* Recognizing identifiers. */
 156
 157 static bool
 158 is_ascii_id1 (unsigned char c)
 159 {
 160   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
 161 }
 162
 163 static bool
 164 is_ascii_idn (unsigned char c)
 165 {
 166   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
 167 }
 168
 169 /* Returns true if C may be the first byte in an identifier in the current
 170    locale.
 171
 172    (PSPP is transitioning to using Unicode internally for syntax, so please
 173    use lex_uc_is_id1() instead, if possible.) */
 174 bool
 175 lex_is_id1 (char c)
 176 {
 177   return is_ascii_id1 (c) || (unsigned char) c >= 128;
 178 }
 179
 180 /* Returns true if C may be a byte in an identifier other than the first.
 181
 182    (PSPP is transitioning to using Unicode internally for syntax, so please
 183    use lex_uc_is_idn() instead, if possible.) */
 184 bool
 185 lex_is_idn (char c)
 186 {
 187   return is_ascii_idn (c) || (unsigned char) c >= 128;
 188 }
 189
 190 /* Returns true if Unicode code point UC may be the first character in an
 191    identifier in the current locale. */
 192 bool
 193 lex_uc_is_id1 (ucs4_t uc)
 194 {
 195   return (uc < 0x80
 196           ? is_ascii_id1 (uc)
 197           : (uc_is_general_category_withtable (uc,
 198                                                UC_CATEGORY_MASK_L |
 199                                                UC_CATEGORY_MASK_M |
 200                                                UC_CATEGORY_MASK_S)
 201              && uc != 0xfffc && uc != 0xfffd));
 202 }
 203
 204 /* Returns true if Unicode code point UC may be a character in an identifier
 205    other than the first. */
 206 bool
 207 lex_uc_is_idn (ucs4_t uc)
 208 {
 209   return (uc < 0x80
 210           ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
 211           : (uc_is_general_category_withtable (uc,
 212                                                UC_CATEGORY_MASK_L |
 213                                                UC_CATEGORY_MASK_M |
 214                                                UC_CATEGORY_MASK_S |
 215                                                UC_CATEGORY_MASK_N)
 216              && uc != 0xfffc && uc != 0xfffd));
 217 }
 218
 219 /* Returns true if Unicode code point UC is a space that separates tokens. */
 220 bool
 221 lex_uc_is_space (ucs4_t uc)
 222 {
 223   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 224   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 225           || (uc >= 0x80
 226               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 227                   || (uc >= 0x2000 && uc <= 0x200a)
 228                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 229                   || uc == 0x205f || uc == 0x3000)));
 230 }
 231
 232
 233 /* Returns the length of the longest prefix of STRING that forms
 234    a valid identifier.  Returns zero if STRING does not begin
 235    with a valid identifier.  */
 236 size_t
 237 lex_id_get_length (struct substring string)
 238 {
 239   const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
 240   size_t len = string.length;
 241   size_t ofs;
 242   int mblen;
 243
 244   for (ofs = 0; ofs < string.length; ofs += mblen)
 245     {
 246       ucs4_t uc;
 247
 248       mblen = u8_mbtouc (&uc, s + ofs, len - ofs);
 249       if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))
 250         break;
 251     }
 252
 253   return ofs;
 254 }
 255 \f
 256 /* Comparing identifiers. */
 257
 258 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 259
 260    Keywords match if one of the following is true: KEYWORD and
 261    TOKEN are identical, or TOKEN is at least 3 characters long
 262    and those characters are identical to KEYWORD.  (Letters that
 263    differ only in case are considered identical.)
 264
 265    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 266 bool
 267 lex_id_match (struct substring keyword, struct substring token)
 268 {
 269   return lex_id_match_n (keyword, token, 3);
 270 }
 271
 272 /* Returns true if TOKEN is a case-insensitive match for at least
 273    the first N characters of KEYWORD.
 274
 275    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 276 bool
 277 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 278 {
 279   size_t token_len = ss_length (token);
 280   size_t keyword_len = ss_length (keyword);
 281
 282   if (token_len >= n && token_len < keyword_len)
 283     return ss_equals_case (ss_head (keyword, token_len), token);
 284   else
 285     return ss_equals_case (keyword, token);
 286 }
 287 \f
 288 /* Table of keywords. */
 289 struct keyword
 290   {
 291     int token;
 292     const struct substring identifier;
 293   };
 294
 295 static const struct keyword keywords[] =
 296   {
 297     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 298     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 299     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 300     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 301     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 302     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 303     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 304     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 305     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 306     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 307     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 308     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 309     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 310   };
 311 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 312
 313 /* Returns true if TOKEN is representable as a keyword. */
 314 bool
 315 lex_is_keyword (enum token_type token)
 316 {
 317   const struct keyword *kw;
 318   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 319     if (kw->token == token)
 320       return true;
 321   return false;
 322 }
 323
 324 /* Returns the proper token type, either T_ID or a reserved
 325    keyword enum, for ID. */
 326 int
 327 lex_id_to_token (struct substring id)
 328 {
 329   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 330     {
 331       const struct keyword *kw;
 332       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 333         if (ss_equals_case (kw->identifier, id))
 334           return kw->token;
 335     }
 336
 337   return T_ID;
 338 }