pintos-os.org Git - pspp/blob - src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18    This file is concerned with the definition of the PSPP syntax, NOT the
  19    action of scanning/parsing code.
  20 */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <assert.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29
  30 #include "libpspp/assertion.h"
  31
  32 #include "gl/c-ctype.h"
  33
  34 /* Recognizing identifiers. */
  35
  36 static bool
  37 is_ascii_id1 (unsigned char c)
  38 {
  39   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
  40 }
  41
  42 static bool
  43 is_ascii_idn (unsigned char c)
  44 {
  45   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
  46 }
  47
  48 /* Returns true if C may be the first byte in an identifier in the current
  49    locale.
  50
  51    (PSPP is transitioning to using Unicode internally for syntax, so please
  52    use lex_uc_is_id1() instead, if possible.) */
  53 bool
  54 lex_is_id1 (char c)
  55 {
  56   return is_ascii_id1 (c) || (unsigned char) c >= 128;
  57 }
  58
  59 /* Returns true if C may be a byte in an identifier other than the first.
  60
  61    (PSPP is transitioning to using Unicode internally for syntax, so please
  62    use lex_uc_is_idn() instead, if possible.) */
  63 bool
  64 lex_is_idn (char c)
  65 {
  66   return is_ascii_idn (c) || (unsigned char) c >= 128;
  67 }
  68
  69 /* Returns true if Unicode code point UC may be the first character in an
  70    identifier in the current locale. */
  71 bool
  72 lex_uc_is_id1 (ucs4_t uc)
  73 {
  74   return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
  75 }
  76
  77 /* Returns true if Unicode code point UC may be a character in an identifier
  78    other than the first. */
  79 bool
  80 lex_uc_is_idn (ucs4_t uc)
  81 {
  82   return (is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
  83           || (uc >= 0x80 && uc_is_property_id_continue (uc)));
  84 }
  85
  86 /* Returns true if Unicode code point UC is a space that separates tokens. */
  87 bool
  88 lex_uc_is_space (ucs4_t uc)
  89 {
  90   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
  91   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
  92           || (uc >= 0x80
  93               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
  94                   || (uc >= 0x2000 && uc <= 0x200a)
  95                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
  96                   || uc == 0x205f || uc == 0x3000)));
  97 }
  98
  99
 100 /* Returns the length of the longest prefix of STRING that forms
 101    a valid identifier.  Returns zero if STRING does not begin
 102    with a valid identifier.  */
 103 size_t
 104 lex_id_get_length (struct substring string)
 105 {
 106   size_t length = 0;
 107   if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
 108     {
 109       length = 1;
 110       while (length < ss_length (string)
 111              && lex_is_idn (ss_at (string, length)))
 112         length++;
 113     }
 114   return length;
 115 }
 116 \f
 117 /* Comparing identifiers. */
 118
 119 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 120
 121    Keywords match if one of the following is true: KEYWORD and
 122    TOKEN are identical, or TOKEN is at least 3 characters long
 123    and those characters are identical to KEYWORD.  (Letters that
 124    differ only in case are considered identical.)
 125
 126    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 127 bool
 128 lex_id_match (struct substring keyword, struct substring token)
 129 {
 130   return lex_id_match_n (keyword, token, 3);
 131 }
 132
 133 /* Returns true if TOKEN is a case-insensitive match for at least
 134    the first N characters of KEYWORD.
 135
 136    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 137 bool
 138 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 139 {
 140   size_t token_len = ss_length (token);
 141   size_t keyword_len = ss_length (keyword);
 142
 143   if (token_len >= n && token_len < keyword_len)
 144     return ss_equals_case (ss_head (keyword, token_len), token);
 145   else
 146     return ss_equals_case (keyword, token);
 147 }
 148 \f
 149 /* Table of keywords. */
 150 struct keyword
 151   {
 152     int token;
 153     const struct substring identifier;
 154   };
 155
 156 static const struct keyword keywords[] =
 157   {
 158     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 159     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 160     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 161     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 162     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 163     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 164     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 165     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 166     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 167     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 168     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 169     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 170     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 171   };
 172 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 173
 174 /* Returns true if TOKEN is representable as a keyword. */
 175 bool
 176 lex_is_keyword (int token)
 177 {
 178   const struct keyword *kw;
 179   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 180     if (kw->token == token)
 181       return true;
 182   return false;
 183 }
 184
 185 /* Returns the proper token type, either T_ID or a reserved
 186    keyword enum, for ID. */
 187 int
 188 lex_id_to_token (struct substring id)
 189 {
 190   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 191     {
 192       const struct keyword *kw;
 193       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 194         if (ss_equals_case (kw->identifier, id))
 195           return kw->token;
 196     }
 197
 198   return T_ID;
 199 }
 200
 201 /* Returns the name for the given keyword token type. */
 202 const char *
 203 lex_id_name (int token)
 204 {
 205   const struct keyword *kw;
 206
 207   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 208     if (kw->token == token)
 209       {
 210         /* A "struct substring" is not guaranteed to be
 211            null-terminated, as our caller expects, but in this
 212            case it always will be. */
 213         return ss_data (kw->identifier);
 214       }
 215   NOT_REACHED ();
 216 }