pintos-os.org Git - pspp/blob - src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
/*
   This file is concerned with the definition of the PSPP syntax, NOT the
   action of scanning/parsing code.
*/
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <string.h>
  27 #include <unictype.h>
  28
  29 #include "libpspp/assertion.h"
  30
  31 #include "gl/c-ctype.h"
  32
  33 #include "gettext.h"
  34 #define _(msgid) gettext (msgid)
  35
  36 /* Tokens. */
  37
  38 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
  39 const char *
  40 token_type_to_name (enum token_type type)
  41 {
  42   switch (type)
  43     {
  44 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  45       TOKEN_TYPES
  46 #undef TOKEN_TYPE
  47     case TOKEN_N_TYPES:
  48     default:
  49       return "unknown token type";
  50     }
  51 }
  52
  53 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
  54    as a statically allocated constant string.  This function returns NULL for
  55    tokens that don't have any fixed string representation, such as identifier
  56    and number tokens. */
  57 const char *
  58 token_type_to_string (enum token_type token)
  59 {
  60   switch (token)
  61     {
  62     case T_ID:
  63     case T_POS_NUM:
  64     case T_NEG_NUM:
  65     case T_STRING:
  66     case T_STOP:
  67       return NULL;
  68
  69     case T_ENDCMD:
  70       return ".";
  71
  72     case T_PLUS:
  73       return "+";
  74
  75     case T_DASH:
  76       return "-";
  77
  78     case T_ASTERISK:
  79       return "*";
  80
  81     case T_SLASH:
  82       return "/";
  83
  84     case T_EQUALS:
  85       return "=";
  86
  87     case T_LPAREN:
  88       return "(";
  89
  90     case T_RPAREN:
  91       return ")";
  92
  93     case T_LBRACK:
  94       return "[";
  95
  96     case T_RBRACK:
  97       return "]";
  98
  99     case T_COMMA:
 100       return ",";
 101
 102     case T_AND:
 103       return "AND";
 104
 105     case T_OR:
 106       return "OR";
 107
 108     case T_NOT:
 109       return "NOT";
 110
 111     case T_EQ:
 112       return "EQ";
 113
 114     case T_GE:
 115       return ">=";
 116
 117     case T_GT:
 118       return ">";
 119
 120     case T_LE:
 121       return "<=";
 122
 123     case T_LT:
 124       return "<";
 125
 126     case T_NE:
 127       return "~=";
 128
 129     case T_ALL:
 130       return "ALL";
 131
 132     case T_BY:
 133       return "BY";
 134
 135     case T_TO:
 136       return "TO";
 137
 138     case T_WITH:
 139       return "WITH";
 140
 141     case T_EXP:
 142       return "**";
 143
 144     case TOKEN_N_TYPES:
 145       NOT_REACHED ();
 146     }
 147
 148   NOT_REACHED ();
 149 }
 150
 151 /* Recognizing identifiers. */
 152
 153 static bool
 154 is_ascii_id1 (unsigned char c)
 155 {
 156   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
 157 }
 158
 159 static bool
 160 is_ascii_idn (unsigned char c)
 161 {
 162   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
 163 }
 164
 165 /* Returns true if C may be the first byte in an identifier in the current
 166    locale.
 167
 168    (PSPP is transitioning to using Unicode internally for syntax, so please
 169    use lex_uc_is_id1() instead, if possible.) */
 170 bool
 171 lex_is_id1 (char c)
 172 {
 173   return is_ascii_id1 (c) || (unsigned char) c >= 128;
 174 }
 175
 176 /* Returns true if C may be a byte in an identifier other than the first.
 177
 178    (PSPP is transitioning to using Unicode internally for syntax, so please
 179    use lex_uc_is_idn() instead, if possible.) */
 180 bool
 181 lex_is_idn (char c)
 182 {
 183   return is_ascii_idn (c) || (unsigned char) c >= 128;
 184 }
 185
 186 /* Returns true if Unicode code point UC may be the first character in an
 187    identifier in the current locale. */
 188 bool
 189 lex_uc_is_id1 (ucs4_t uc)
 190 {
 191   return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
 192 }
 193
 194 /* Returns true if Unicode code point UC may be a character in an identifier
 195    other than the first. */
 196 bool
 197 lex_uc_is_idn (ucs4_t uc)
 198 {
 199   return (uc < 0x80
 200           ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
 201           : uc >= 0x80 && uc_is_property_id_continue (uc));
 202 }
 203
 204 /* Returns true if Unicode code point UC is a space that separates tokens. */
 205 bool
 206 lex_uc_is_space (ucs4_t uc)
 207 {
 208   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 209   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 210           || (uc >= 0x80
 211               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 212                   || (uc >= 0x2000 && uc <= 0x200a)
 213                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 214                   || uc == 0x205f || uc == 0x3000)));
 215 }
 216
 217
 218 /* Returns the length of the longest prefix of STRING that forms
 219    a valid identifier.  Returns zero if STRING does not begin
 220    with a valid identifier.  */
 221 size_t
 222 lex_id_get_length (struct substring string)
 223 {
 224   size_t length = 0;
 225   if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
 226     {
 227       length = 1;
 228       while (length < ss_length (string)
 229              && lex_is_idn (ss_at (string, length)))
 230         length++;
 231     }
 232   return length;
 233 }
 234 \f
 235 /* Comparing identifiers. */
 236
 237 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 238
 239    Keywords match if one of the following is true: KEYWORD and
 240    TOKEN are identical, or TOKEN is at least 3 characters long
 241    and those characters are identical to KEYWORD.  (Letters that
 242    differ only in case are considered identical.)
 243
 244    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 245 bool
 246 lex_id_match (struct substring keyword, struct substring token)
 247 {
 248   return lex_id_match_n (keyword, token, 3);
 249 }
 250
 251 /* Returns true if TOKEN is a case-insensitive match for at least
 252    the first N characters of KEYWORD.
 253
 254    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 255 bool
 256 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 257 {
 258   size_t token_len = ss_length (token);
 259   size_t keyword_len = ss_length (keyword);
 260
 261   if (token_len >= n && token_len < keyword_len)
 262     return ss_equals_case (ss_head (keyword, token_len), token);
 263   else
 264     return ss_equals_case (keyword, token);
 265 }
 266 \f
/* Table of keywords.

   Each entry pairs a reserved word with the token type that it denotes.  The
   table is searched linearly by lex_is_keyword() and lex_id_to_token(). */
struct keyword
  {
    int token;                          /* Token type (a T_* value). */
    const struct substring identifier;  /* Keyword text, in uppercase. */
  };

/* Every identifier in this table is between 2 and 4 characters long;
   lex_id_to_token() relies on that to skip the search for identifiers of
   other lengths. */
static const struct keyword keywords[] =
  {
    { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
    { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
    { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
    { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
    { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
    { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
    { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
    { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
    { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
    { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
    { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
    { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
    { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
  };
/* Number of entries in keywords[]. */
static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 291
 292 /* Returns true if TOKEN is representable as a keyword. */
 293 bool
 294 lex_is_keyword (enum token_type token)
 295 {
 296   const struct keyword *kw;
 297   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 298     if (kw->token == token)
 299       return true;
 300   return false;
 301 }
 302
 303 /* Returns the proper token type, either T_ID or a reserved
 304    keyword enum, for ID. */
 305 int
 306 lex_id_to_token (struct substring id)
 307 {
 308   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 309     {
 310       const struct keyword *kw;
 311       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 312         if (ss_equals_case (kw->identifier, id))
 313           return kw->token;
 314     }
 315
 316   return T_ID;
 317 }