src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
   This file is concerned with the definition of the PSPP syntax, NOT the
   action of scanning/parsing code.
  20 */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <assert.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29
  30 #include "libpspp/assertion.h"
  31
  32 #include "gl/c-ctype.h"
  33
  34 /* Tokens. */
  35
  36 const char *
  37 token_type_to_string (enum token_type type)
  38 {
  39   switch (type)
  40     {
  41 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  42       TOKEN_TYPES
  43 #undef TOKEN_TYPE
  44     case TOKEN_N_TYPES:
  45     default:
  46       return "unknown token type";
  47     }
  48 }
  49
  50 /* Recognizing identifiers. */
  51
  52 static bool
  53 is_ascii_id1 (unsigned char c)
  54 {
  55   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
  56 }
  57
  58 static bool
  59 is_ascii_idn (unsigned char c)
  60 {
  61   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
  62 }
  63
  64 /* Returns true if C may be the first byte in an identifier in the current
  65    locale.
  66
  67    (PSPP is transitioning to using Unicode internally for syntax, so please
  68    use lex_uc_is_id1() instead, if possible.) */
  69 bool
  70 lex_is_id1 (char c)
  71 {
  72   return is_ascii_id1 (c) || (unsigned char) c >= 128;
  73 }
  74
  75 /* Returns true if C may be a byte in an identifier other than the first.
  76
  77    (PSPP is transitioning to using Unicode internally for syntax, so please
  78    use lex_uc_is_idn() instead, if possible.) */
  79 bool
  80 lex_is_idn (char c)
  81 {
  82   return is_ascii_idn (c) || (unsigned char) c >= 128;
  83 }
  84
  85 /* Returns true if Unicode code point UC may be the first character in an
  86    identifier in the current locale. */
  87 bool
  88 lex_uc_is_id1 (ucs4_t uc)
  89 {
  90   return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
  91 }
  92
  93 /* Returns true if Unicode code point UC may be a character in an identifier
  94    other than the first. */
  95 bool
  96 lex_uc_is_idn (ucs4_t uc)
  97 {
  98   return (is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
  99           || (uc >= 0x80 && uc_is_property_id_continue (uc)));
 100 }
 101
 102 /* Returns true if Unicode code point UC is a space that separates tokens. */
 103 bool
 104 lex_uc_is_space (ucs4_t uc)
 105 {
 106   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 107   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 108           || (uc >= 0x80
 109               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 110                   || (uc >= 0x2000 && uc <= 0x200a)
 111                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 112                   || uc == 0x205f || uc == 0x3000)));
 113 }
 114
 115
 116 /* Returns the length of the longest prefix of STRING that forms
 117    a valid identifier.  Returns zero if STRING does not begin
 118    with a valid identifier.  */
 119 size_t
 120 lex_id_get_length (struct substring string)
 121 {
 122   size_t length = 0;
 123   if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
 124     {
 125       length = 1;
 126       while (length < ss_length (string)
 127              && lex_is_idn (ss_at (string, length)))
 128         length++;
 129     }
 130   return length;
 131 }
 132 \f
 133 /* Comparing identifiers. */
 134
 135 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 136
 137    Keywords match if one of the following is true: KEYWORD and
 138    TOKEN are identical, or TOKEN is at least 3 characters long
 139    and those characters are identical to KEYWORD.  (Letters that
 140    differ only in case are considered identical.)
 141
 142    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 143 bool
 144 lex_id_match (struct substring keyword, struct substring token)
 145 {
 146   return lex_id_match_n (keyword, token, 3);
 147 }
 148
 149 /* Returns true if TOKEN is a case-insensitive match for at least
 150    the first N characters of KEYWORD.
 151
 152    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 153 bool
 154 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 155 {
 156   size_t token_len = ss_length (token);
 157   size_t keyword_len = ss_length (keyword);
 158
 159   if (token_len >= n && token_len < keyword_len)
 160     return ss_equals_case (ss_head (keyword, token_len), token);
 161   else
 162     return ss_equals_case (keyword, token);
 163 }
 164 \f
 165 /* Table of keywords. */
 166 struct keyword
 167   {
 168     int token;
 169     const struct substring identifier;
 170   };
 171
 172 static const struct keyword keywords[] =
 173   {
 174     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 175     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 176     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 177     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 178     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 179     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 180     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 181     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 182     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 183     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 184     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 185     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 186     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 187   };
 188 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 189
 190 /* Returns true if TOKEN is representable as a keyword. */
 191 bool
 192 lex_is_keyword (enum token_type token)
 193 {
 194   const struct keyword *kw;
 195   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 196     if (kw->token == token)
 197       return true;
 198   return false;
 199 }
 200
 201 /* Returns the proper token type, either T_ID or a reserved
 202    keyword enum, for ID. */
 203 int
 204 lex_id_to_token (struct substring id)
 205 {
 206   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 207     {
 208       const struct keyword *kw;
 209       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 210         if (ss_equals_case (kw->identifier, id))
 211           return kw->token;
 212     }
 213
 214   return T_ID;
 215 }
 216
 217 /* Returns the name for the given keyword token type. */
 218 const char *
 219 lex_id_name (enum token_type token)
 220 {
 221   const struct keyword *kw;
 222
 223   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 224     if (kw->token == token)
 225       {
 226         /* A "struct substring" is not guaranteed to be
 227            null-terminated, as our caller expects, but in this
 228            case it always will be. */
 229         return ss_data (kw->identifier);
 230       }
 231   NOT_REACHED ();
 232 }