pintos-os.org Git - pspp/blob - src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18    This file is concerned with the definition of the PSPP syntax, NOT the
  19    action of scanning/parsing code.
  20 */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <string.h>
  27 #include <unistr.h>
  28 #include <unictype.h>
  29
  30 #include "libpspp/assertion.h"
  31 #include "libpspp/cast.h"
  32
  33 #include "gl/c-ctype.h"
  34
  35 #include "gettext.h"
  36 #define _(msgid) gettext (msgid)
  37
  38 /* Tokens. */
  39
  40 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
  41 const char *
  42 token_type_to_name (enum token_type type)
  43 {
  44   switch (type)
  45     {
  46 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  47       TOKEN_TYPES
  48 #undef TOKEN_TYPE
  49     case TOKEN_N_TYPES:
  50     default:
  51       return "unknown token type";
  52     }
  53 }
  54
  55 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
  56    as a statically allocated constant string.  This function returns NULL for
  57    tokens that don't have any fixed string representation, such as identifier
  58    and number tokens. */
  59 const char *
  60 token_type_to_string (enum token_type token)
  61 {
  62   switch (token)
  63     {
  64     case T_ID:
  65     case T_MACRO_ID:
  66     case T_POS_NUM:
  67     case T_NEG_NUM:
  68     case T_STRING:
  69     case T_STOP:
  70       return NULL;
  71
  72     case T_ENDCMD:
  73       return ".";
  74
  75     case T_PLUS:
  76       return "+";
  77
  78     case T_DASH:
  79       return "-";
  80
  81     case T_ASTERISK:
  82       return "*";
  83
  84     case T_SLASH:
  85       return "/";
  86
  87     case T_EQUALS:
  88       return "=";
  89
  90     case T_LPAREN:
  91       return "(";
  92
  93     case T_RPAREN:
  94       return ")";
  95
  96     case T_LBRACK:
  97       return "[";
  98
  99     case T_RBRACK:
 100       return "]";
 101
 102     case T_COMMA:
 103       return ",";
 104
 105     case T_AND:
 106       return "AND";
 107
 108     case T_OR:
 109       return "OR";
 110
 111     case T_NOT:
 112       return "NOT";
 113
 114     case T_EQ:
 115       return "EQ";
 116
 117     case T_GE:
 118       return ">=";
 119
 120     case T_GT:
 121       return ">";
 122
 123     case T_LE:
 124       return "<=";
 125
 126     case T_LT:
 127       return "<";
 128
 129     case T_NE:
 130       return "~=";
 131
 132     case T_ALL:
 133       return "ALL";
 134
 135     case T_BY:
 136       return "BY";
 137
 138     case T_TO:
 139       return "TO";
 140
 141     case T_WITH:
 142       return "WITH";
 143
 144     case T_EXP:
 145       return "**";
 146
 147     case TOKEN_N_TYPES:
 148       NOT_REACHED ();
 149     }
 150
 151   NOT_REACHED ();
 152 }
 153
 154 /* Recognizing identifiers. */
 155
 156 static bool
 157 is_ascii_id1 (unsigned char c)
 158 {
 159   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
 160 }
 161
 162 static bool
 163 is_ascii_idn (unsigned char c)
 164 {
 165   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
 166 }
 167
 168 /* Returns true if C may be the first byte in an identifier in the current
 169    locale.
 170
 171    (PSPP is transitioning to using Unicode internally for syntax, so please
 172    use lex_uc_is_id1() instead, if possible.) */
 173 bool
 174 lex_is_id1 (char c)
 175 {
 176   return is_ascii_id1 (c) || (unsigned char) c >= 128;
 177 }
 178
 179 /* Returns true if C may be a byte in an identifier other than the first.
 180
 181    (PSPP is transitioning to using Unicode internally for syntax, so please
 182    use lex_uc_is_idn() instead, if possible.) */
 183 bool
 184 lex_is_idn (char c)
 185 {
 186   return is_ascii_idn (c) || (unsigned char) c >= 128;
 187 }
 188
 189 /* Returns true if Unicode code point UC may be the first character in an
 190    identifier in the current locale. */
 191 bool
 192 lex_uc_is_id1 (ucs4_t uc)
 193 {
 194   return (uc < 0x80
 195           ? is_ascii_id1 (uc)
 196           : (uc_is_general_category_withtable (uc,
 197                                                UC_CATEGORY_MASK_L |
 198                                                UC_CATEGORY_MASK_M |
 199                                                UC_CATEGORY_MASK_S)
 200              && uc != 0xfffc && uc != 0xfffd));
 201 }
 202
 203 /* Returns true if Unicode code point UC may be a character in an identifier
 204    other than the first. */
 205 bool
 206 lex_uc_is_idn (ucs4_t uc)
 207 {
 208   return (uc < 0x80
 209           ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
 210           : (uc_is_general_category_withtable (uc,
 211                                                UC_CATEGORY_MASK_L |
 212                                                UC_CATEGORY_MASK_M |
 213                                                UC_CATEGORY_MASK_S |
 214                                                UC_CATEGORY_MASK_N)
 215              && uc != 0xfffc && uc != 0xfffd));
 216 }
 217
 218 /* Returns true if Unicode code point UC is a space that separates tokens. */
 219 bool
 220 lex_uc_is_space (ucs4_t uc)
 221 {
 222   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 223   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 224           || (uc >= 0x80
 225               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 226                   || (uc >= 0x2000 && uc <= 0x200a)
 227                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 228                   || uc == 0x205f || uc == 0x3000)));
 229 }
 230
 231
 232 /* Returns the length of the longest prefix of STRING that forms
 233    a valid identifier.  Returns zero if STRING does not begin
 234    with a valid identifier.  */
 235 size_t
 236 lex_id_get_length (struct substring string)
 237 {
 238   const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
 239   size_t len = string.length;
 240   size_t ofs;
 241   int mblen;
 242
 243   for (ofs = 0; ofs < string.length; ofs += mblen)
 244     {
 245       ucs4_t uc;
 246
 247       mblen = u8_mbtouc (&uc, s + ofs, len - ofs);
 248       if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))
 249         break;
 250     }
 251
 252   return ofs;
 253 }
 254 \f
 255 /* Comparing identifiers. */
 256
 257 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 258
 259    Keywords match if one of the following is true: KEYWORD and
 260    TOKEN are identical, or TOKEN is at least 3 characters long
 261    and those characters are identical to KEYWORD.  (Letters that
 262    differ only in case are considered identical.)
 263
 264    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 265 bool
 266 lex_id_match (struct substring keyword, struct substring token)
 267 {
 268   return lex_id_match_n (keyword, token, 3);
 269 }
 270
 271 /* Returns true if TOKEN is a case-insensitive match for at least
 272    the first N characters of KEYWORD.
 273
 274    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 275 bool
 276 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 277 {
 278   size_t token_len = ss_length (token);
 279   size_t keyword_len = ss_length (keyword);
 280
 281   if (token_len >= n && token_len < keyword_len)
 282     return ss_equals_case (ss_head (keyword, token_len), token);
 283   else
 284     return ss_equals_case (keyword, token);
 285 }
 286 \f
 287 /* Table of keywords. */
 288 struct keyword
 289   {
 290     int token;
 291     const struct substring identifier;
 292   };
 293
 294 static const struct keyword keywords[] =
 295   {
 296     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 297     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 298     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 299     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 300     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 301     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 302     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 303     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 304     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 305     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 306     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 307     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 308     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 309   };
 310 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 311
 312 /* Returns true if TOKEN is representable as a keyword. */
 313 bool
 314 lex_is_keyword (enum token_type token)
 315 {
 316   const struct keyword *kw;
 317   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 318     if (kw->token == token)
 319       return true;
 320   return false;
 321 }
 322
 323 /* Returns the proper token type, either T_ID or a reserved
 324    keyword enum, for ID. */
 325 int
 326 lex_id_to_token (struct substring id)
 327 {
 328   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 329     {
 330       const struct keyword *kw;
 331       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 332         if (ss_equals_case (kw->identifier, id))
 333           return kw->token;
 334     }
 335
 336   return T_ID;
 337 }