treewide: Replace <name>_cnt by n_<name>s and <name>_cap by allocated_<name>.

[pspp] / src / data / identifier.c
diff --git a/src/data/identifier.c b/src/data/identifier.c

index 1e149b8dc3ea9673c801be8ed7cb0010dbb546b5..d9d9b2a6444c25a8f76fec3df66089e1306e6b40 100644 (file)
--- a/src/data/identifier.c
+++ b/src/data/identifier.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -23,14 +23,143 @@
  
  #include "data/identifier.h"
  
-#include <assert.h>
  #include <string.h>
+#include <unistr.h>
  #include <unictype.h>
  
  #include "libpspp/assertion.h"
+#include "libpspp/cast.h"
  
  #include "gl/c-ctype.h"
  
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+
+/* Tokens. */
+
+/* Returns the name of token type TYPE as a statically allocated string: the
+   enumerator's name without its "T_" prefix, e.g. "ID" for T_ID.
+
+   The cases are generated by expanding TOKEN_TYPES with a local TOKEN_TYPE
+   macro that pastes "T_" onto each name for the case label and stringizes
+   the name for the return value.  (TOKEN_TYPES is defined outside this
+   file; presumably it lists every token type.)  Any value of TYPE that
+   matches none of the generated cases yields "unknown token type". */
+const char *
+token_type_to_name (enum token_type type)
+{
+  switch (type)
+    {
+#define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
+      TOKEN_TYPES
+#undef TOKEN_TYPE
+    default:
+      return "unknown token type";
+    }
+}
+
+/* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
+   as a statically allocated constant string.  This function returns NULL for
+   tokens that don't have any fixed string representation, such as identifier
+   and number tokens.
+
+   Specifically, NULL is returned for T_ID, T_POS_NUM, T_NEG_NUM, T_STRING,
+   T_MACRO_ID, T_MACRO_PUNCT, and T_STOP.  Punctuation tokens and the
+   relational operators T_GE, T_GT, T_LE, T_LT, and T_NE map to their
+   symbols, whereas T_AND, T_OR, T_NOT, T_EQ, T_ALL, T_BY, T_TO, and T_WITH
+   map to their uppercase keyword spellings.
+
+   The switch has no default case, so a TOKEN that matches none of the cases
+   below falls out of the switch and reaches NOT_REACHED (). */
+const char *
+token_type_to_string (enum token_type token)
+{
+  switch (token)
+    {
+    case T_ID:
+    case T_POS_NUM:
+    case T_NEG_NUM:
+    case T_STRING:
+    case T_MACRO_ID:
+    case T_MACRO_PUNCT:
+    case T_STOP:
+      return NULL;
+
+    case T_ENDCMD:
+      return ".";
+
+    case T_PLUS:
+      return "+";
+
+    case T_DASH:
+      return "-";
+
+    case T_ASTERISK:
+      return "*";
+
+    case T_SLASH:
+      return "/";
+
+    case T_EQUALS:
+      return "=";
+
+    case T_LPAREN:
+      return "(";
+
+    case T_RPAREN:
+      return ")";
+
+    case T_LBRACK:
+      return "[";
+
+    case T_RBRACK:
+      return "]";
+
+    case T_LCURLY:
+      return "{";
+
+    case T_RCURLY:
+      return "}";
+
+    case T_COMMA:
+      return ",";
+
+    case T_SEMICOLON:
+      return ";";
+
+    case T_COLON:
+      return ":";
+
+    case T_AND:
+      return "AND";
+
+    case T_OR:
+      return "OR";
+
+    case T_NOT:
+      return "NOT";
+
+    case T_EQ:
+      return "EQ";              /* The "=" spelling belongs to T_EQUALS. */
+
+    case T_GE:
+      return ">=";
+
+    case T_GT:
+      return ">";
+
+    case T_LE:
+      return "<=";
+
+    case T_LT:
+      return "<";
+
+    case T_NE:
+      return "~=";
+
+    case T_ALL:
+      return "ALL";
+
+    case T_BY:
+      return "BY";
+
+    case T_TO:
+      return "TO";
+
+    case T_WITH:
+      return "WITH";
+
+    case T_EXP:
+      return "**";
+    }
+
+  NOT_REACHED ();
+}
+
  /* Recognizing identifiers. */
  
  static bool
@@ -71,7 +200,13 @@ lex_is_idn (char c)
  bool
  lex_uc_is_id1 (ucs4_t uc)
  {
-  return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
+  return (uc < 0x80             /* ASCII is decided by is_ascii_id1 () alone. */
+          ? is_ascii_id1 (uc)
+          : (uc_is_general_category_withtable (uc,
+                                               UC_CATEGORY_MASK_L |  /* Letters. */
+                                               UC_CATEGORY_MASK_M |  /* Marks. */
+                                               UC_CATEGORY_MASK_S)   /* Symbols. */
+             && uc != 0xfffc && uc != 0xfffd));  /* Not the two replacement characters. */
  }
  
  /* Returns true if Unicode code point UC may be a character in an identifier
@@ -79,8 +214,14 @@ lex_uc_is_id1 (ucs4_t uc)
  bool
  lex_uc_is_idn (ucs4_t uc)
  {
-  return (is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
-          || (uc >= 0x80 && uc_is_property_id_continue (uc)));
+  return (uc < 0x80             /* ASCII: first-character set plus digits, '.', '_'. */
+          ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
+          : (uc_is_general_category_withtable (uc,
+                                               UC_CATEGORY_MASK_L |  /* Letters. */
+                                               UC_CATEGORY_MASK_M |  /* Marks. */
+                                               UC_CATEGORY_MASK_S |  /* Symbols. */
+                                               UC_CATEGORY_MASK_N)   /* Numbers. */
+             && uc != 0xfffc && uc != 0xfffd));  /* Not the two replacement characters. */
  }
  
  /* Returns true if Unicode code point UC is a space that separates tokens. */
@@ -103,15 +244,21 @@ lex_uc_is_space (ucs4_t uc)
  size_t
  lex_id_get_length (struct substring string)
  {
-  size_t length = 0;
-  if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
+  const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
+  size_t len = string.length;
+  size_t ofs;                   /* Byte offset into S of the next character. */
+  int mblen;                    /* Result of u8_mbtouc (), added to OFS each pass. */
+
+  for (ofs = 0; ofs < string.length; ofs += mblen)
      {
-      length = 1;
-      while (length < ss_length (string)
-             && lex_is_idn (ss_at (string, length)))
-        length++;
+      ucs4_t uc;
+
+      mblen = u8_mbtouc (&uc, s + ofs, len - ofs);  /* Decode the character at OFS. */
+      if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))  /* Offset 0 uses the first-character rule. */
+        break;
      }
-  return length;
+
+  return ofs;                   /* Offset of the first rejected character, or where the loop ended. */
  }
  \f
  /* Comparing identifiers. */
@@ -169,14 +316,14 @@ static const struct keyword keywords[] =
      { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
      { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
    };
-static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
+static const size_t n_keywords = sizeof keywords / sizeof *keywords;  /* Number of elements in keywords[]. */
  
  /* Returns true if TOKEN is representable as a keyword. */
  bool
  lex_is_keyword (enum token_type token)
  {
    const struct keyword *kw;
-  for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
+  for (kw = keywords; kw < &keywords[n_keywords]; kw++)  /* Linear scan of keywords[]. */
      if (kw->token == token)
        return true;
    return false;
@@ -190,27 +337,10 @@ lex_id_to_token (struct substring id)
    if (ss_length (id) >= 2 && ss_length (id) <= 4)
      {
        const struct keyword *kw;
-      for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
+      for (kw = keywords; kw < &keywords[n_keywords]; kw++)
          if (ss_equals_case (kw->identifier, id))
            return kw->token;
      }
  
    return T_ID;
  }
-
-/* Returns the name for the given keyword token type. */
-const char *
-lex_id_name (enum token_type token)
-{
-  const struct keyword *kw;
-
-  for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
-    if (kw->token == token)
-      {
-        /* A "struct substring" is not guaranteed to be
-           null-terminated, as our caller expects, but in this
-           case it always will be. */
-        return ss_data (kw->identifier);
-      }
-  NOT_REACHED ();
-}