From ca93ab32faadf34ab43dd5de464dbd137895e82b Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 3 Jan 2013 21:34:25 -0800 Subject: [PATCH] identifier: Broaden the class of characters allowed in identifiers. It appears that SPSS allows almost any Unicode character in an identifier, and in particular U+00B4 ACUTE ACCENT. This commit adds more permitted characters to the identifier checks. Reported by Helen Barghan. --- Smake | 3 +-- src/data/identifier.c | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Smake b/Smake index 72ec791a5e..1e54533169 100644 --- a/Smake +++ b/Smake @@ -81,8 +81,7 @@ GNULIB_MODULES = \ unicase/u8-tolower \ unicase/u8-toupper \ unictype/ctype-print \ - unictype/property-id-continue \ - unictype/property-id-start \ + unictype/category-of \ unigbrk/uc-is-grapheme-break \ unilbrk/u8-possible-linebreaks \ uninorm/nfkd \ diff --git a/src/data/identifier.c b/src/data/identifier.c index a757b31e3a..6191f0db90 100644 --- a/src/data/identifier.c +++ b/src/data/identifier.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -188,7 +188,13 @@ lex_is_idn (char c) bool lex_uc_is_id1 (ucs4_t uc) { - return is_ascii_id1 (uc) || (uc >= 0x80 && uc_is_property_id_start (uc)); + return (uc < 0x80 + ? is_ascii_id1 (uc) + : (uc_is_general_category_withtable (uc, + UC_CATEGORY_MASK_L | + UC_CATEGORY_MASK_M | + UC_CATEGORY_MASK_S) + && uc != 0xfffc && uc != 0xfffd)); } /* Returns true if Unicode code point UC may be a character in an identifier @@ -198,7 +204,12 @@ lex_uc_is_idn (ucs4_t uc) { return (uc < 0x80 ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' 
|| uc == '_' - : uc >= 0x80 && uc_is_property_id_continue (uc)); + : (uc_is_general_category_withtable (uc, + UC_CATEGORY_MASK_L | + UC_CATEGORY_MASK_M | + UC_CATEGORY_MASK_S | + UC_CATEGORY_MASK_N) + && uc != 0xfffc && uc != 0xfffd)); } /* Returns true if Unicode code point UC is a space that separates tokens. */ -- 2.30.2