1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* This file is concerned with the definition of the PSPP syntax, NOT the
   action of scanning/parsing code. */
24 #include "data/identifier.h"
31 #include "libpspp/assertion.h"
32 #include "libpspp/cast.h"
33 #include "libpspp/i18n.h"
34 #include "libpspp/message.h"
36 #include "gl/c-ctype.h"
39 #define _(msgid) gettext (msgid)
43 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
45 token_type_to_name (enum token_type type)
/* TOKEN_TYPE(TYPE) expands to one complete `case': the label is the
   enumerator T_<TYPE>, built by token pasting, and the value returned is the
   bare name <TYPE> as a string literal, built by stringification. */
49 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
/* Presumably reached only for a value that matches none of the generated
   cases -- TODO confirm against the full switch. */
54 return "unknown token type";
/* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
   as a statically allocated constant string.  This function returns NULL for
   tokens that don't have any fixed string representation, such as identifier
   tokens. */
63 token_type_to_string (enum token_type token)
156 /* Recognizing identifiers. */
/* Returns true if byte C may begin an identifier as far as ASCII is
   concerned: that is, if C is a letter according to c_isalpha() or one of
   the characters '@', '#', and '$'. */
static bool
is_ascii_id1 (unsigned char c)
{
  switch (c)
    {
    case '@':
    case '#':
    case '$':
      return true;

    default:
      return c_isalpha (c);
    }
}
/* Returns true if byte C may appear in an identifier after the first
   character as far as ASCII is concerned: anything is_ascii_id1() accepts,
   plus decimal digits, '.', and '_'. */
static bool
is_ascii_idn (unsigned char c)
{
  if (c == '.' || c == '_')
    return true;

  return isdigit (c) || is_ascii_id1 (c);
}
170 /* Returns true if C may be the first byte in an identifier in the current locale.
173 (PSPP is transitioning to using Unicode internally for syntax, so please
174 use lex_uc_is_id1() instead, if possible.) */
/* ASCII bytes are classified by is_ascii_id1().  Any byte whose value is 128
   or more once converted to unsigned char is accepted without further
   checks. */
178 return is_ascii_id1 (c) || (unsigned char) c >= 128;
181 /* Returns true if C may be a byte in an identifier other than the first.
183 (PSPP is transitioning to using Unicode internally for syntax, so please
184 use lex_uc_is_idn() instead, if possible.) */
/* ASCII bytes are classified by is_ascii_idn().  Any byte whose value is 128
   or more once converted to unsigned char is accepted without further
   checks. */
188 return is_ascii_idn (c) || (unsigned char) c >= 128;
191 /* Returns true if Unicode code point UC may be the first character in an
192 identifier in the current locale. */
194 lex_uc_is_id1 (ucs4_t uc)
196 return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
199 /* Returns true if Unicode code point UC may be a character in an identifier
200 other than the first. */
202 lex_uc_is_idn (ucs4_t uc)
204 return (is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
205 || (uc >= 0x80 && uc_is_property_id_continue (uc)));
208 /* Returns true if Unicode code point UC is a space that separates tokens. */
210 lex_uc_is_space (ucs4_t uc)
212 /* These are all of the Unicode characters in category Zs, Zl, or Zp. */
213 return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
215 && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
216 || (uc >= 0x2000 && uc <= 0x200a)
217 || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
218 || uc == 0x205f || uc == 0x3000)));
222 /* Returns the length of the longest prefix of STRING that forms
223 a valid identifier. Returns zero if STRING does not begin
224 with a valid identifier. */
226 lex_id_get_length (struct substring string)
/* An identifier needs at least one byte, and that byte must satisfy
   lex_is_id1().  The emptiness test comes first so that ss_first() is never
   evaluated for an empty STRING. */
229 if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
/* Keep going while `length' is still inside STRING and the byte at that
   offset satisfies lex_is_idn().  The bounds test comes first so that
   ss_at() is never evaluated past the end. */
232 while (length < ss_length (string)
233 && lex_is_idn (ss_at (string, length)))
239 /* Comparing identifiers. */
241 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
243 Keywords match if one of the following is true: KEYWORD and
244 TOKEN are identical, or TOKEN is at least 3 characters long
245 and those characters are identical to KEYWORD. (Letters that
246 differ only in case are considered identical.)
248 KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
250 lex_id_match (struct substring keyword, struct substring token)
252 return lex_id_match_n (keyword, token, 3);
255 /* Returns true if TOKEN is a case-insensitive match for at least
256 the first N characters of KEYWORD.
258 KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
260 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
262 size_t token_len = ss_length (token);
263 size_t keyword_len = ss_length (keyword);
265 if (token_len >= n && token_len < keyword_len)
266 return ss_equals_case (ss_head (keyword, token_len), token);
268 return ss_equals_case (keyword, token);
271 /* Table of keywords. */
/* Spelling of the keyword, in upper case in the table below. */
275 const struct substring identifier;
/* One entry per reserved word, pairing its token type with its spelling.
   The functions below search this table linearly. */
278 static const struct keyword keywords[] =
/* Logical operators. */
280 { T_AND, SS_LITERAL_INITIALIZER ("AND") },
281 { T_OR, SS_LITERAL_INITIALIZER ("OR") },
282 { T_NOT, SS_LITERAL_INITIALIZER ("NOT") },
/* Relational operators. */
283 { T_EQ, SS_LITERAL_INITIALIZER ("EQ") },
284 { T_GE, SS_LITERAL_INITIALIZER ("GE") },
285 { T_GT, SS_LITERAL_INITIALIZER ("GT") },
286 { T_LE, SS_LITERAL_INITIALIZER ("LE") },
287 { T_LT, SS_LITERAL_INITIALIZER ("LT") },
288 { T_NE, SS_LITERAL_INITIALIZER ("NE") },
/* Remaining reserved words. */
289 { T_ALL, SS_LITERAL_INITIALIZER ("ALL") },
290 { T_BY, SS_LITERAL_INITIALIZER ("BY") },
291 { T_TO, SS_LITERAL_INITIALIZER ("TO") },
292 { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
/* Number of entries in keywords[]. */
294 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
296 /* Returns true if TOKEN is representable as a keyword. */
298 lex_is_keyword (enum token_type token)
300 const struct keyword *kw;
301 for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
302 if (kw->token == token)
307 /* Returns the proper token type, either T_ID or a reserved
308 keyword enum, for ID. */
310 lex_id_to_token (struct substring id)
/* Every spelling in keywords[] is between 2 and 4 bytes long, so an ID
   whose length falls outside that range cannot match and the table is not
   searched.  NOTE(review): these bounds are hard-coded and must be widened
   if a shorter or longer keyword is ever added to the table. */
312 if (ss_length (id) >= 2 && ss_length (id) <= 4)
314 const struct keyword *kw;
/* Compare ID against each keyword's spelling using ss_equals_case(). */
315 for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
316 if (ss_equals_case (kw->identifier, id))
323 /* Returns the name for the given keyword token type. */
325 lex_id_name (enum token_type token)
327 const struct keyword *kw;
/* Linear search of keywords[] for the entry whose token type is TOKEN. */
329 for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
330 if (kw->token == token)
332 /* A "struct substring" is not guaranteed to be
333 null-terminated, as our caller expects, but in this
334 case it always will be.  (Every identifier in keywords[] is
initialized from a string literal via SS_LITERAL_INITIALIZER.) */
335 return ss_data (kw->identifier);