lexer: Reimplement for better testability and internationalization.

[pspp-builds.git] / src / data / identifier.h
diff --git a/src/data/identifier.h b/src/data/identifier.h

index 53be9a7655048c7cabd12c15950bf3a6cd791f6b..7f2f904239167f1c5c4200a570621dd7d83cf66a 100644 (file)
--- a/src/data/identifier.h
+++ b/src/data/identifier.h
@@ -1,73 +1,97 @@
-/* PSPP - computes sample statistics.
-   Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
-   Written by Ben Pfaff <blp@gnu.org>.
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc.
  
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
  
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
  
     You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA. */
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  
-#if !lex_def_h
-#define lex_def_h 1
+#ifndef DATA_IDENTIFIER_H
+#define DATA_IDENTIFIER_H 1
  
  #include <ctype.h>
  #include <stdbool.h>
-#include <sys/types.h>
+#include <unitypes.h>
+#include "libpspp/str.h"
+
+#define TOKEN_TYPES                                                     \
+    TOKEN_TYPE(ID)                         /* Identifier. */            \
+    TOKEN_TYPE(POS_NUM)                    /* Positive number. */       \
+    TOKEN_TYPE(NEG_NUM)                    /* Negative number. */       \
+    TOKEN_TYPE(STRING)                     /* Quoted string. */         \
+    TOKEN_TYPE(STOP)                       /* End of input. */          \
+    /* Punctuation. */                                                  \
+    TOKEN_TYPE(ENDCMD)                     /* . */                      \
+    TOKEN_TYPE(PLUS)                       /* + */                      \
+    TOKEN_TYPE(DASH)                       /* - */                      \
+    TOKEN_TYPE(ASTERISK)                   /* * */                      \
+    TOKEN_TYPE(SLASH)                      /* / */                      \
+    TOKEN_TYPE(EQUALS)                     /* = */                      \
+    TOKEN_TYPE(LPAREN)                     /* ( */                      \
+    TOKEN_TYPE(RPAREN)                     /* ) */                      \
+    TOKEN_TYPE(LBRACK)                     /* [ */                      \
+    TOKEN_TYPE(RBRACK)                     /* ] */                      \
+    TOKEN_TYPE(COMMA)                      /* , */                      \
+    /* Logical operators. */                                            \
+    TOKEN_TYPE(AND)                        /* AND */                    \
+    TOKEN_TYPE(OR)                         /* OR */                     \
+    TOKEN_TYPE(NOT)                        /* NOT */                    \
+    /* Relational operators. */                                         \
+    TOKEN_TYPE(EQ)                         /* EQ */                     \
+    TOKEN_TYPE(GE)                         /* GE or >= */               \
+    TOKEN_TYPE(GT)                         /* GT or > */                \
+    TOKEN_TYPE(LE)                         /* LE or <= */               \
+    TOKEN_TYPE(LT)                         /* LT or < */                \
+    TOKEN_TYPE(NE)                         /* NE or ~= */               \
+    /* Remaining word tokens. */                                        \
+    TOKEN_TYPE(ALL)                        /* ALL */                    \
+    TOKEN_TYPE(BY)                         /* BY */                     \
+    TOKEN_TYPE(TO)                         /* TO */                     \
+    TOKEN_TYPE(WITH)                       /* WITH */                   \
+                                                                        \
+    TOKEN_TYPE(EXP)                        /* ** */
  
  /* Token types. */
-/* The order of the enumerals below is important.  Do not change it. */
-enum
+enum token_type
   {
-    T_ID = 256, /* Identifier. */
-    T_POS_NUM, /* Positive number. */
-    T_NEG_NUM, /* Negative number. */
-    T_STRING,  /* Quoted string. */
-    T_STOP,    /* End of input. */
+#define TOKEN_TYPE(TYPE) T_##TYPE,      /* Each list entry becomes T_<TYPE>. */
+    TOKEN_TYPES
+    TOKEN_N_TYPES                       /* Count of the token types above. */
+#undef TOKEN_TYPE
+  };
  
-    T_AND,     /* AND */
-    T_OR,      /* OR */
-    T_NOT,     /* NOT */
+const char *token_type_to_name (enum token_type);
+const char *token_type_to_string (enum token_type);
  
-    T_EQ,      /* EQ */
-    T_GE,      /* GE or >= */
-    T_GT,      /* GT or > */
-    T_LE,      /* LE or <= */
-    T_LT,      /* LT or < */
-    T_NE,      /* NE or ~= */
+/* Tokens. */
+bool lex_is_keyword (enum token_type);
  
-    T_ALL,     /* ALL */
-    T_BY,      /* BY */
-    T_TO,      /* TO */
-    T_WITH,    /* WITH */
+/* Validating identifiers. */
+#define ID_MAX_LEN 64          /* Maximum length of identifier, in bytes. */
  
-    T_EXP,     /* ** */
-
-    T_FIRST_KEYWORD = T_AND,
-    T_LAST_KEYWORD = T_WITH,
-    T_N_KEYWORDS = T_LAST_KEYWORD - T_FIRST_KEYWORD + 1
-  };
+bool id_is_valid (const char *id, const char *dict_encoding, bool issue_error);
+bool id_is_plausible (const char *id, bool issue_error);
  
  /* Recognizing identifiers. */
  bool lex_is_id1 (char);
  bool lex_is_idn (char);
-char *lex_skip_identifier (const char *);
+bool lex_uc_is_id1 (ucs4_t);
+bool lex_uc_is_idn (ucs4_t);
+bool lex_uc_is_space (ucs4_t);
+size_t lex_id_get_length (struct substring);
  
  /* Comparing identifiers. */
-bool lex_id_match_len (const char *keyword_string, size_t keyword_len,
-                       const char *token_string, size_t token_len);
-bool lex_id_match (const char *keyword_string, const char *token_string);
-int lex_id_to_token (const char *id, size_t len);
-
-extern const char *keywords[T_N_KEYWORDS + 1] ;
+bool lex_id_match (struct substring keyword, struct substring token);
+bool lex_id_match_n (struct substring keyword, struct substring token,
+                     size_t n);
+int lex_id_to_token (struct substring);
  
-#endif /* !lex_def_h */
+#endif /* DATA_IDENTIFIER_H */