lexer: Add tokens for '{', '}', ':', ';' for use in the matrix language.
[pspp] / src / data / identifier.h
index 352ef2eafc1b3b381a82d7ee55e96b4dc344623e..dcbce970cda97168ab96a22c3e51660d33efc763 100644 (file)
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
 
 #include <ctype.h>
 #include <stdbool.h>
-#include <sys/types.h>
-#include <libpspp/str.h>
+#include <unitypes.h>
+#include "libpspp/str.h"
+#include "gl/verify.h"
 
+#define TOKEN_TYPES                                                     \
+    TOKEN_TYPE(STOP)                /* End of input. */                 \
+                                                                        \
+    TOKEN_TYPE(ID)                  /* Identifier. */                   \
+    TOKEN_TYPE(POS_NUM)             /* Positive number. */              \
+    TOKEN_TYPE(NEG_NUM)             /* Negative number. */              \
+    TOKEN_TYPE(STRING)              /* Quoted string. */                \
+                                                                        \
+    TOKEN_TYPE(ENDCMD)              /* . */                             \
+    TOKEN_TYPE(PLUS)                /* + */                             \
+    TOKEN_TYPE(DASH)                /* - */                             \
+    TOKEN_TYPE(ASTERISK)            /* * */                             \
+    TOKEN_TYPE(SLASH)               /* / */                             \
+    TOKEN_TYPE(EQUALS)              /* = */                             \
+    TOKEN_TYPE(LPAREN)              /* ( */                             \
+    TOKEN_TYPE(RPAREN)              /* ) */                             \
+    TOKEN_TYPE(LBRACK)              /* [ */                             \
+    TOKEN_TYPE(RBRACK)              /* ] */                             \
+    TOKEN_TYPE(LCURLY)              /* { */                             \
+    TOKEN_TYPE(RCURLY)              /* } */                             \
+    TOKEN_TYPE(COMMA)               /* , */                             \
+    TOKEN_TYPE(SEMICOLON)           /* ; */                             \
+    TOKEN_TYPE(COLON)               /* : */                             \
+                                                                        \
+    TOKEN_TYPE(AND)                 /* AND */                           \
+    TOKEN_TYPE(OR)                  /* OR */                            \
+    TOKEN_TYPE(NOT)                 /* NOT */                           \
+                                                                        \
+    TOKEN_TYPE(EQ)                  /* EQ */                            \
+    TOKEN_TYPE(GE)                  /* GE or >= */                      \
+    TOKEN_TYPE(GT)                  /* GT or > */                       \
+    TOKEN_TYPE(LE)                  /* LE or <= */                      \
+    TOKEN_TYPE(LT)                  /* LT or < */                       \
+    TOKEN_TYPE(NE)                  /* NE or ~= */                      \
+                                                                        \
+    TOKEN_TYPE(ALL)                 /* ALL */                           \
+    TOKEN_TYPE(BY)                  /* BY */                            \
+    TOKEN_TYPE(TO)                  /* TO */                            \
+    TOKEN_TYPE(WITH)                /* WITH */                          \
+                                                                        \
+    TOKEN_TYPE(EXP)                 /* ** */                            \
+                                                                        \
+    TOKEN_TYPE(MACRO_ID)            /* Identifier starting with '!'. */ \
+    TOKEN_TYPE(MACRO_PUNCT)         /* Miscellaneous punctuator. */
 /* Token types. */
-enum
+enum token_type
   {
-    T_ID = 256, /* Identifier. */
-    T_POS_NUM, /* Positive number. */
-    T_NEG_NUM, /* Negative number. */
-    T_STRING,  /* Quoted string. */
-    T_STOP,    /* End of input. */
+#define TOKEN_TYPE(TYPE) T_##TYPE,
+    TOKEN_TYPES
+#undef TOKEN_TYPE
+  };
+verify(T_STOP == 0);
 
-    T_AND,     /* AND */
-    T_OR,      /* OR */
-    T_NOT,     /* NOT */
+#define TOKEN_TYPE(TYPE) + 1
+enum { TOKEN_N_TYPES = TOKEN_TYPES };
+#undef TOKEN_TYPE
 
-    T_EQ,      /* EQ */
-    T_GE,      /* GE or >= */
-    T_GT,      /* GT or > */
-    T_LE,      /* LE or <= */
-    T_LT,      /* LT or < */
-    T_NE,      /* NE or ~= */
+const char *token_type_to_name (enum token_type);
+const char *token_type_to_string (enum token_type);
 
-    T_ALL,     /* ALL */
-    T_BY,      /* BY */
-    T_TO,      /* TO */
-    T_WITH,    /* WITH */
+/* Tokens. */
+bool lex_is_keyword (enum token_type);
 
-    T_EXP,     /* ** */
-  };
+/* Validating identifiers. */
+#define ID_MAX_LEN 64          /* Maximum length of identifier, in bytes. */
 
-/* Tokens. */
-bool lex_is_keyword (int token);
+bool id_is_valid (const char *id, const char *dict_encoding, bool issue_error);
+bool id_is_plausible (const char *id, bool issue_error);
 
 /* Recognizing identifiers. */
 bool lex_is_id1 (char);
 bool lex_is_idn (char);
+bool lex_uc_is_id1 (ucs4_t);
+bool lex_uc_is_idn (ucs4_t);
+bool lex_uc_is_space (ucs4_t);
 size_t lex_id_get_length (struct substring);
 
 /* Comparing identifiers. */
@@ -64,7 +106,4 @@ bool lex_id_match_n (struct substring keyword, struct substring token,
                      size_t n);
 int lex_id_to_token (struct substring);
 
-/* Identifier names. */
-const char *lex_id_name (int);
-
 #endif /* !data/identifier.h */