identifier: Rename token_type_to_string() and make a new version.

[pspp-builds.git] / src / data / identifier.c
diff --git a/src/data/identifier.c b/src/data/identifier.c

index daab97523ebede50d34b4bbddc7e011dc9c179ce..4b613bb480edb5555cb1532176d745182345c0c4 100644 (file)
--- a/src/data/identifier.c
+++ b/src/data/identifier.c
@@ -1,61 +1,229 @@
-/* PSPP - computes sample statistics.
-   Copyright (C) 1997-9, 2000, 2005 Free Software Foundation, Inc.
-   Written by John Darrington <john@darrington.wattle.id.au>
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
  
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
  
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
  
     You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA. */
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  
-/* 
-   This file is concerned with the definition of the PSPP syntax, NOT the 
+/*
+   This file is concerned with the definition of the PSPP syntax, NOT the
     action of scanning/parsing code .
  */
  
  #include <config.h>
-#include "identifier.h"
  
+#include "data/identifier.h"
  
  #include <assert.h>
  #include <string.h>
-#include <libpspp/assertion.h>
+#include <unictype.h>
+#include <unistr.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+
+#include "gl/c-ctype.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+/* Tokens. */
+
+/* Returns the name of token type TYPE as a string literal: the enumerator's
+   name without its "T_" prefix, e.g. "ID" for T_ID.
+
+   The "case" labels are generated by temporarily defining TOKEN_TYPE and
+   expanding TOKEN_TYPES (defined outside this file; presumably one
+   TOKEN_TYPE(...) entry per token type).  TOKEN_N_TYPES and any value
+   without a generated case yield "unknown token type". */
+const char *
+token_type_to_name (enum token_type type)
+{
+  switch (type)
+    {
+#define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
+      TOKEN_TYPES
+#undef TOKEN_TYPE
+    case TOKEN_N_TYPES:
+    default:
+      return "unknown token type";
+    }
+}
+
+/* Returns the fixed ASCII spelling of TOKEN, that is, text that yields TOKEN
+   when it appears in a syntax file.  The result is a statically allocated
+   constant string.  Returns NULL for token types that have no fixed
+   spelling, such as identifier and number tokens. */
+const char *
+token_type_to_string (enum token_type token)
+{
+  switch (token)
+    {
+    /* Token types without a fixed spelling. */
+    case T_ID: case T_POS_NUM: case T_NEG_NUM: case T_STRING: case T_STOP:
+      return NULL;
+
+    /* Punctuation and arithmetic operators. */
+    case T_ENDCMD:   return ".";
+    case T_PLUS:     return "+";
+    case T_DASH:     return "-";
+    case T_ASTERISK: return "*";
+    case T_SLASH:    return "/";
+    case T_EXP:      return "**";
+    case T_EQUALS:   return "=";
+    case T_LPAREN:   return "(";
+    case T_RPAREN:   return ")";
+    case T_LBRACK:   return "[";
+    case T_RBRACK:   return "]";
+    case T_COMMA:    return ",";
+
+    /* Logical operators. */
+    case T_AND:      return "AND";
+    case T_OR:       return "OR";
+    case T_NOT:      return "NOT";
+
+    /* Relational operators. */
+    case T_EQ:       return "EQ";
+    case T_GE:       return ">=";
+    case T_GT:       return ">";
+    case T_LE:       return "<=";
+    case T_LT:       return "<";
+    case T_NE:       return "~=";
+
+    /* Reserved words. */
+    case T_ALL:      return "ALL";
+    case T_BY:       return "BY";
+    case T_TO:       return "TO";
+    case T_WITH:     return "WITH";
+
+    /* Not a real token type. */
+    case TOKEN_N_TYPES:
+      NOT_REACHED ();
+    }
+
+  /* TOKEN was not any of the enumerators listed above. */
+  NOT_REACHED ();
+}
  
  /* Recognizing identifiers. */
  
-/* Returns true if C may be the first character in an
+/* Returns true if C is an ASCII character that may begin an identifier:
+   a character accepted by c_isalpha(), or one of '@', '#', and '$'. */
+static bool
+is_ascii_id1 (unsigned char c)
+{
+  switch (c)
+    {
+    case '@':
+    case '#':
+    case '$':
+      return true;
+
+    default:
+      return c_isalpha (c);
+    }
+}
+
+/* Returns true if C is an ASCII character that may appear in an identifier
+   after the first character: anything accepted by is_ascii_id1(), a decimal
+   digit, '.', or '_'. */
+static bool
+is_ascii_idn (unsigned char c)
+{
+  /* c_isdigit() instead of isdigit(): it matches the c_isalpha() call in
+     is_ascii_id1(), it does not consult the current locale, and it comes
+     from "gl/c-ctype.h", which this file already includes. */
+  return is_ascii_id1 (c) || c_isdigit (c) || c == '.' || c == '_';
+}
+
+/* Returns true if byte C may begin an identifier in the current locale.
+   Every byte with value 128 or above is accepted, along with every byte
+   that is_ascii_id1() accepts.
+
+   (PSPP is transitioning to using Unicode internally for syntax, so please
+   use lex_uc_is_id1() instead, if possible.) */
+bool
+lex_is_id1 (char c)
+{
+  unsigned char byte = c;
+
+  return byte >= 128 || is_ascii_id1 (byte);
+}
+
+/* Returns true if byte C may appear in an identifier after the first byte.
+   Every byte with value 128 or above is accepted, along with every byte
+   that is_ascii_idn() accepts.
+
+   (PSPP is transitioning to using Unicode internally for syntax, so please
+   use lex_uc_is_idn() instead, if possible.) */
+bool
+lex_is_idn (char c)
+{
+  unsigned char byte = c;
+
+  return byte >= 128 || is_ascii_idn (byte);
+}
+
+/* Returns true if Unicode code point UC may be the first character in an
     identifier in the current locale. */
  bool
-lex_is_id1 (char c_) 
+lex_uc_is_id1 (ucs4_t uc)
  {
-  unsigned char c = c_;
-  return isalpha (c) || c == '@' || c == '#' || c == '$';
+  /* Check the range before consulting is_ascii_id1().  Its parameter is an
+     unsigned char, so a code point above 0x7f would be truncated to its low
+     8 bits and could be mistaken for an ASCII letter, '@', '#', or '$'
+     (e.g. U+2023, whose low byte is '#').  Non-ASCII code points are judged
+     only by their Unicode ID_Start property. */
+  return uc < 0x80 ? is_ascii_id1 (uc) : uc_is_property_id_start (uc);
  }
  
+/* Returns true if Unicode code point UC may be a character in an identifier
+   other than the first. */
+bool
+lex_uc_is_idn (ucs4_t uc)
+{
+  /* Check the range before using the ASCII helper.  is_ascii_idn() takes an
+     unsigned char, so a code point above 0x7f would be truncated to its low
+     8 bits and could be mistaken for an ASCII identifier character; passing
+     such a value to a <ctype.h>-style digit test is also outside that
+     function's defined argument range.  Non-ASCII code points are judged
+     only by their Unicode ID_Continue property. */
+  return uc < 0x80 ? is_ascii_idn (uc) : uc_is_property_id_continue (uc);
+}
  
-/* Returns true if C may be a character in an identifier other
-   than the first. */
+/* Returns true if Unicode code point UC is a space that separates tokens. */
  bool
-lex_is_idn (char c_)
+lex_uc_is_space (ucs4_t uc)
  {
-  unsigned char c = c_;
-  return lex_is_id1 (c) || isdigit (c) || c == '.' || c == '_';
+  /* Accepted: the ASCII space, the control characters U+0009 through U+000D
+     and U+0085, and the separator code points listed below (intended to
+     cover Unicode categories Zs, Zl, and Zp).  The "uc >= 0x80" test lets
+     ASCII input skip the comparisons against the non-ASCII code points.  */
+  return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
+          || (uc >= 0x80
+              && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
+                  || (uc >= 0x2000 && uc <= 0x200a)
+                  || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
+                  || uc == 0x205f || uc == 0x3000)));
  }
  
+
  /* Returns the length of the longest prefix of STRING that forms
     a valid identifier.  Returns zero if STRING does not begin
     with a valid identifier.  */
  size_t
-lex_id_get_length (struct substring string) 
+lex_id_get_length (struct substring string)
  {
    size_t length = 0;
    if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
@@ -74,27 +242,40 @@ lex_id_get_length (struct substring string)
  
     Keywords match if one of the following is true: KEYWORD and
     TOKEN are identical, or TOKEN is at least 3 characters long
-   and those characters are identical to KEYWORD. */
+   and those characters are identical to KEYWORD.  (Letters that
+   differ only in case are considered identical.)
+
+   KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
  bool
  lex_id_match (struct substring keyword, struct substring token)
+{
+  return lex_id_match_n (keyword, token, 3); /* 3: shortest abbreviation. */
+}
+
+/* Returns true if TOKEN matches KEYWORD, comparing with ss_equals_case().
+   TOKEN matches if it equals all of KEYWORD, or if it is shorter than
+   KEYWORD, has length at least N, and equals the leading part of KEYWORD
+   of the same length.
+
+   KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
+bool
+lex_id_match_n (struct substring keyword, struct substring token, size_t n)
  {
    size_t token_len = ss_length (token);
    size_t keyword_len = ss_length (keyword);
-  
-  if (token_len >= 3 && token_len < keyword_len)
+
+  if (token_len >= n && token_len < keyword_len)
      return ss_equals_case (ss_head (keyword, token_len), token);
    else
      return ss_equals_case (keyword, token);
  }
  \f
  /* Table of keywords. */
-struct keyword 
+struct keyword
    {
      int token;
      const struct substring identifier;
    };
  
-static const struct keyword keywords[] = 
+static const struct keyword keywords[] =
    {
      { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
      { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
@@ -114,11 +295,11 @@ static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
  
  /* Returns true if TOKEN is representable as a keyword. */
  bool
-lex_is_keyword (int token) 
+lex_is_keyword (enum token_type token)
  {
    const struct keyword *kw;
    for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
-    if (kw->token == token) 
+    if (kw->token == token)
        return true;
    return false;
  }
@@ -128,30 +309,30 @@ lex_is_keyword (int token)
  int
  lex_id_to_token (struct substring id)
  {
-  if (ss_length (id) >= 2 && ss_length (id) <= 4) 
+  if (ss_length (id) >= 2 && ss_length (id) <= 4)
      {
        const struct keyword *kw;
        for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
          if (ss_equals_case (kw->identifier, id))
            return kw->token;
      }
-  
+
    return T_ID;
  }
  
  /* Returns the name for the given keyword token type. */
  const char *
-lex_id_name (int token) 
+lex_id_name (enum token_type token)
  {
    const struct keyword *kw;
  
    for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
-    if (kw->token == token) 
+    if (kw->token == token)
        {
          /* A "struct substring" is not guaranteed to be
             null-terminated, as our caller expects, but in this
             case it always will be. */
-        return ss_data (kw->identifier); 
+        return ss_data (kw->identifier);
        }
    NOT_REACHED ();
  }