/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2005, 2009, 2010 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include "data/identifier.h"
-#include <assert.h>
#include <string.h>
-#include <unictype.h>
#include <unistr.h>
+#include <unictype.h>
#include "libpspp/assertion.h"
#include "libpspp/cast.h"
-#include "libpspp/i18n.h"
-#include "libpspp/message.h"
#include "gl/c-ctype.h"
bool
lex_uc_is_id1 (ucs4_t uc)
{
- return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc));
+ return (uc < 0x80 /* Test lex_id_get_length applies at offset 0; ASCII is decided by is_ascii_id1 alone. */
+ ? is_ascii_id1 (uc)
+ : (uc_is_general_category_withtable (uc, /* Non-ASCII: UC must fall in general category L, M or S... */
+ UC_CATEGORY_MASK_L |
+ UC_CATEGORY_MASK_M |
+ UC_CATEGORY_MASK_S)
+ && uc != 0xfffc && uc != 0xfffd)); /* ...and U+FFFC/U+FFFD are rejected explicitly; U+FFFD is what u8_mbtouc reports for malformed input (per libunistring docs). */
}
/* Returns true if Unicode code point UC may be a character in an identifier
   other than the first. */
bool
lex_uc_is_idn (ucs4_t uc)
{
- return (is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
- || (uc >= 0x80 && uc_is_property_id_continue (uc)));
+ return (uc < 0x80 /* Test lex_id_get_length applies at every offset after 0. */
+ ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_' /* ASCII: id1 characters plus digits, '.' and '_'.  NOTE(review): gl/c-ctype.h is included, so c_isdigit may be preferable -- confirm. */
+ : (uc_is_general_category_withtable (uc, /* Non-ASCII: same categories as lex_uc_is_id1 (L, M, S) plus N. */
+ UC_CATEGORY_MASK_L |
+ UC_CATEGORY_MASK_M |
+ UC_CATEGORY_MASK_S |
+ UC_CATEGORY_MASK_N)
+ && uc != 0xfffc && uc != 0xfffd)); /* U+FFFC and U+FFFD are rejected explicitly, as in lex_uc_is_id1. */
}
/* Returns true if Unicode code point UC is a space that separates tokens. */
size_t
lex_id_get_length (struct substring string)
{
- size_t length = 0;
- if (!ss_is_empty (string) && lex_is_id1 (ss_first (string)))
+ const uint8_t *s = CHAR_CAST (const uint8_t *, string.string); /* STRING's bytes, viewed as the uint8_t units u8_mbtouc takes. */
+ size_t len = string.length; /* Total byte count; same value as the loop bound below. */
+ size_t ofs; /* Byte offset of the code point currently being examined. */
+ int mblen; /* Byte count u8_mbtouc returned for that code point. */
+
+ for (ofs = 0; ofs < string.length; ofs += mblen) /* Advance one decoded code point per iteration. */
{
- length = 1;
- while (length < ss_length (string)
- && lex_is_idn (ss_at (string, length)))
- length++;
+ ucs4_t uc;
+
+ mblen = u8_mbtouc (&uc, s + ofs, len - ofs); /* Decode the next code point from the remaining bytes. */
+ if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc))) /* First code point uses the id1 rule, later ones the idn rule. */
+ break;
}
- return length;
+
+ return ofs; /* Length in bytes of the identifier prefix; 0 if STRING is empty or its first code point fails lex_uc_is_id1. */
}
\f
/* Comparing identifiers. */
return T_ID;
}
-
-/* Returns the name for the given keyword token type. */
-const char *
-lex_id_name (enum token_type token)
-{
- const struct keyword *kw;
-
- for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
- if (kw->token == token)
- {
- /* A "struct substring" is not guaranteed to be
- null-terminated, as our caller expects, but in this
- case it always will be. */
- return ss_data (kw->identifier);
- }
- NOT_REACHED ();
-}