src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18    This file is concerned with the definition of the PSPP syntax, NOT the
  19    action of scanning/parsing code.
  20 */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <assert.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistr.h>
  30
  31 #include "libpspp/assertion.h"
  32 #include "libpspp/cast.h"
  33 #include "libpspp/i18n.h"
  34 #include "libpspp/message.h"
  35
  36 #include "gl/c-ctype.h"
  37
  38 #include "gettext.h"
  39 #define _(msgid) gettext (msgid)
  40
  41 /* Tokens. */
  42
  43 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
  44 const char *
  45 token_type_to_name (enum token_type type)
  46 {
  47   switch (type)
  48     {
  49 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  50       TOKEN_TYPES
  51 #undef TOKEN_TYPE
  52     case TOKEN_N_TYPES:
  53     default:
  54       return "unknown token type";
  55     }
  56 }
  57
  58 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
  59    as a statically allocated constant string.  This function returns NULL for
  60    tokens that don't have any fixed string representation, such as identifier
  61    and number tokens. */
  62 const char *
  63 token_type_to_string (enum token_type token)
  64 {
  65   switch (token)
  66     {
  67     case T_ID:
  68     case T_POS_NUM:
  69     case T_NEG_NUM:
  70     case T_STRING:
  71     case T_STOP:
  72       return NULL;
  73
  74     case T_ENDCMD:
  75       return ".";
  76
  77     case T_PLUS:
  78       return "+";
  79
  80     case T_DASH:
  81       return "-";
  82
  83     case T_ASTERISK:
  84       return "*";
  85
  86     case T_SLASH:
  87       return "/";
  88
  89     case T_EQUALS:
  90       return "=";
  91
  92     case T_LPAREN:
  93       return "(";
  94
  95     case T_RPAREN:
  96       return ")";
  97
  98     case T_LBRACK:
  99       return "[";
 100
 101     case T_RBRACK:
 102       return "]";
 103
 104     case T_COMMA:
 105       return ",";
 106
 107     case T_AND:
 108       return "AND";
 109
 110     case T_OR:
 111       return "OR";
 112
 113     case T_NOT:
 114       return "NOT";
 115
 116     case T_EQ:
 117       return "EQ";
 118
 119     case T_GE:
 120       return ">=";
 121
 122     case T_GT:
 123       return ">";
 124
 125     case T_LE:
 126       return "<=";
 127
 128     case T_LT:
 129       return "<";
 130
 131     case T_NE:
 132       return "~=";
 133
 134     case T_ALL:
 135       return "ALL";
 136
 137     case T_BY:
 138       return "BY";
 139
 140     case T_TO:
 141       return "TO";
 142
 143     case T_WITH:
 144       return "WITH";
 145
 146     case T_EXP:
 147       return "**";
 148
 149     case TOKEN_N_TYPES:
 150       NOT_REACHED ();
 151     }
 152
 153   NOT_REACHED ();
 154 }
 155
 156 /* Recognizing identifiers. */
 157
 158 static bool
 159 is_ascii_id1 (unsigned char c)
 160 {
 161   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
 162 }
 163
 164 static bool
 165 is_ascii_idn (unsigned char c)
 166 {
 167   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
 168 }
 169
 170 /* Returns true if C may be the first byte in an identifier in the current
 171    locale.
 172
 173    (PSPP is transitioning to using Unicode internally for syntax, so please
 174    use lex_uc_is_id1() instead, if possible.) */
 175 bool
 176 lex_is_id1 (char c)
 177 {
 178   return is_ascii_id1 (c) || (unsigned char) c >= 128;
 179 }
 180
 181 /* Returns true if C may be a byte in an identifier other than the first.
 182
 183    (PSPP is transitioning to using Unicode internally for syntax, so please
 184    use lex_uc_is_idn() instead, if possible.) */
 185 bool
 186 lex_is_idn (char c)
 187 {
 188   return is_ascii_idn (c) || (unsigned char) c >= 128;
 189 }
 190
 191 /* Returns true if Unicode code point UC may be the first character in an
 192    identifier in the current locale. */
 193 bool
 194 lex_uc_is_id1 (ucs4_t uc)
 195 {
 196   return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
 197 }
 198
 199 /* Returns true if Unicode code point UC may be a character in an identifier
 200    other than the first. */
 201 bool
 202 lex_uc_is_idn (ucs4_t uc)
 203 {
 204   return (is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
 205           || (uc >= 0x80 && uc_is_property_id_continue (uc)));
 206 }
 207
 208 /* Returns true if Unicode code point UC is a space that separates tokens. */
 209 bool
 210 lex_uc_is_space (ucs4_t uc)
 211 {
 212   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 213   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 214           || (uc >= 0x80
 215               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 216                   || (uc >= 0x2000 && uc <= 0x200a)
 217                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 218                   || uc == 0x205f || uc == 0x3000)));
 219 }
 220
 221
 222 /* Returns the length of the longest prefix of STRING that forms
 223    a valid identifier.  Returns zero if STRING does not begin
 224    with a valid identifier.  */
 225 size_t
 226 lex_id_get_length (struct substring string)
 227 {
 228   size_t length = 0;
 229   if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
 230     {
 231       length = 1;
 232       while (length < ss_length (string)
 233              && lex_is_idn (ss_at (string, length)))
 234         length++;
 235     }
 236   return length;
 237 }
 238 \f
 239 /* Comparing identifiers. */
 240
 241 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 242
 243    Keywords match if one of the following is true: KEYWORD and
 244    TOKEN are identical, or TOKEN is at least 3 characters long
 245    and those characters are identical to KEYWORD.  (Letters that
 246    differ only in case are considered identical.)
 247
 248    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 249 bool
 250 lex_id_match (struct substring keyword, struct substring token)
 251 {
 252   return lex_id_match_n (keyword, token, 3);
 253 }
 254
 255 /* Returns true if TOKEN is a case-insensitive match for at least
 256    the first N characters of KEYWORD.
 257
 258    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 259 bool
 260 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 261 {
 262   size_t token_len = ss_length (token);
 263   size_t keyword_len = ss_length (keyword);
 264
 265   if (token_len >= n && token_len < keyword_len)
 266     return ss_equals_case (ss_head (keyword, token_len), token);
 267   else
 268     return ss_equals_case (keyword, token);
 269 }
 270 \f
 271 /* Table of keywords. */
 272 struct keyword
 273   {
 274     int token;
 275     const struct substring identifier;
 276   };
 277
 278 static const struct keyword keywords[] =
 279   {
 280     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 281     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 282     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 283     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 284     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 285     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 286     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 287     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 288     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 289     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 290     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 291     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 292     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 293   };
 294 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 295
 296 /* Returns true if TOKEN is representable as a keyword. */
 297 bool
 298 lex_is_keyword (enum token_type token)
 299 {
 300   const struct keyword *kw;
 301   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 302     if (kw->token == token)
 303       return true;
 304   return false;
 305 }
 306
 307 /* Returns the proper token type, either T_ID or a reserved
 308    keyword enum, for ID. */
 309 int
 310 lex_id_to_token (struct substring id)
 311 {
 312   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 313     {
 314       const struct keyword *kw;
 315       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 316         if (ss_equals_case (kw->identifier, id))
 317           return kw->token;
 318     }
 319
 320   return T_ID;
 321 }
 322
 323 /* Returns the name for the given keyword token type. */
 324 const char *
 325 lex_id_name (enum token_type token)
 326 {
 327   const struct keyword *kw;
 328
 329   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 330     if (kw->token == token)
 331       {
 332         /* A "struct substring" is not guaranteed to be
 333            null-terminated, as our caller expects, but in this
 334            case it always will be. */
 335         return ss_data (kw->identifier);
 336       }
 337   NOT_REACHED ();
 338 }