From: Bruno Haible Date: Tue, 16 Aug 2005 12:21:21 +0000 (+0000) Subject: Make strcasecmp() work right in multibyte locales. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5516de90cec9ff3c9aead77fd8b6e54b4ec3b1c3;p=pspp Make strcasecmp() work right in multibyte locales. --- diff --git a/ChangeLog b/ChangeLog index 5fac83ca71..3ffa629023 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2005-08-16 Bruno Haible + + * modules/strcase (Files): Add m4/mbrtowc.m4. + (Depends-on): Add strnlen1, mbchar. + 2005-08-16 Bruno Haible * modules/strnlen1: New file. diff --git a/lib/ChangeLog b/lib/ChangeLog index 992ce4e369..b75cf9baee 100644 --- a/lib/ChangeLog +++ b/lib/ChangeLog @@ -1,3 +1,10 @@ +2005-08-16 Bruno Haible + + * strcase.h (strcasecmp): Add note in comments. + * strncasecmp.c: Use code from strcasecmp.c. + * strcasecmp.c: Use mbchar module. Define private mbiter variant. + (strcasecmp): Work correctly in multibyte locales. + 2005-08-16 Bruno Haible * strnlen1.h: New file. diff --git a/lib/strcase.h b/lib/strcase.h index a51ed9aa9a..e420798058 100644 --- a/lib/strcase.h +++ b/lib/strcase.h @@ -1,5 +1,5 @@ /* Case-insensitive string comparison functions. - Copyright (C) 1995-1996, 2001, 2003 Free Software Foundation, Inc. + Copyright (C) 1995-1996, 2001, 2003, 2005 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -29,7 +29,8 @@ extern "C" { /* Compare strings S1 and S2, ignoring case, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or greater than S2. - Note: This function does not work correctly in multibyte locales. */ + Note: This function may, in multibyte locales, return 0 for strings of + different lengths! */ extern int strcasecmp (const char *s1, const char *s2); /* Compare no more than N characters of strings S1 and S2, ignoring case, diff --git a/lib/strcasecmp.c b/lib/strcasecmp.c index aa269c6171..70c22a1889 100644 --- a/lib/strcasecmp.c +++ b/lib/strcasecmp.c @@ -1,5 +1,7 @@ -/* strcasecmp.c -- case insensitive string comparator - Copyright (C) 1998, 1999 Free Software Foundation, Inc. +/* Case-insensitive string comparison function. + Copyright (C) 1998, 1999, 2005 Free Software Foundation, Inc. + Written by Bruno Haible , 2005, + based on earlier glibc code. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,52 +17,225 @@ along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#if HAVE_CONFIG_H +#ifdef HAVE_CONFIG_H # include #endif -#ifdef LENGTH_LIMIT -# define STRXCASECMP_FUNCTION strncasecmp -# define STRXCASECMP_DECLARE_N , size_t n -# define LENGTH_LIMIT_EXPR(Expr) Expr -#else -# define STRXCASECMP_FUNCTION strcasecmp -# define STRXCASECMP_DECLARE_N /* empty */ -# define LENGTH_LIMIT_EXPR(Expr) 0 -#endif +/* Specification. */ +#include "strcase.h" -#include #include -#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) +#if HAVE_MBRTOWC -/* Compare {{no more than N characters of }}strings S1 and S2, - ignoring case, returning less than, equal to or - greater than zero if S1 is lexicographically less - than, equal to or greater than S2. */ +#include "strnlen1.h" -int -STRXCASECMP_FUNCTION (const char *s1, const char *s2 STRXCASECMP_DECLARE_N) +/* Like mbiter.h, except it doesn't look at the entire string. */ + +#include "mbchar.h" + +#include +#include +#include +#include +#include + +struct mbiter_multi +{ + bool at_end; /* true if the end of the string has been reached */ + bool in_shift; /* true if next byte may not be interpreted as ASCII */ + mbstate_t state; /* if in_shift: current shift state */ + bool next_done; /* true if mbi_avail has already filled the following */ + struct mbchar cur; /* the current character: + const char *cur.ptr pointer to current character + The following are only valid after mbi_avail. + size_t cur.bytes number of bytes of current character + bool cur.wc_valid true if wc is a valid wide character + wchar_t cur.wc if wc_valid: the current character + */ +}; + +static inline void +mbiter_multi_next (struct mbiter_multi *iter) { - register const unsigned char *p1 = (const unsigned char *) s1; - register const unsigned char *p2 = (const unsigned char *) s2; - unsigned char c1, c2; + if (iter->next_done) + return; + if (iter->in_shift) + goto with_shift; + /* Handle most ASCII characters quickly, without calling mbrtowc(). */ + if (is_basic (*iter->cur.ptr)) + { + /* These characters are part of the basic character set. ISO C 99 + guarantees that their wide character code is identical to their + char code. */ + iter->cur.bytes = 1; + iter->cur.wc = *iter->cur.ptr; + iter->cur.wc_valid = true; + } + else + { + assert (mbsinit (&iter->state)); + iter->in_shift = true; + with_shift: + iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, + strnlen1 (iter->cur.ptr, MB_CUR_MAX), + &iter->state); + if (iter->cur.bytes == (size_t) -1) + { + /* An invalid multibyte sequence was encountered. */ + iter->cur.bytes = 1; + iter->cur.wc_valid = false; + /* Whether to set iter->in_shift = false and reset iter->state + or not is not very important; the string is bogus anyway. */ + } + else if (iter->cur.bytes == (size_t) -2) + { + /* An incomplete multibyte character at the end. */ + iter->cur.bytes = strlen (iter->cur.ptr) + 1; + iter->cur.wc_valid = false; + /* Whether to set iter->in_shift = false and reset iter->state + or not is not important; the string end is reached anyway. */ + } + else + { + if (iter->cur.bytes == 0) + { + /* A null wide character was encountered. */ + iter->cur.bytes = 1; + assert (*iter->cur.ptr == '\0'); + assert (iter->cur.wc == 0); + } + iter->cur.wc_valid = true; + + /* When in the initial state, we can go back treating ASCII + characters more quickly. */ + if (mbsinit (&iter->state)) + iter->in_shift = false; + } + } + iter->next_done = true; +} + +static inline void +mbiter_multi_reloc (struct mbiter_multi *iter, ptrdiff_t ptrdiff) +{ + iter->cur.ptr += ptrdiff; +} - if (p1 == p2 || LENGTH_LIMIT_EXPR (n == 0)) +/* Iteration macros. */ +typedef struct mbiter_multi mbi_iterator_t; +#define mbi_init(iter, startptr) \ + ((iter).cur.ptr = (startptr), (iter).at_end = false, \ + (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \ + (iter).next_done = false) +#define mbi_avail(iter) \ + (!(iter).at_end && (mbiter_multi_next (&(iter)), true)) +#define mbi_advance(iter) \ + ((mb_isnul ((iter).cur) ? ((iter).at_end = true) : 0), \ + (iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false) + +/* Access to the current character. */ +#define mbi_cur(iter) (iter).cur +#define mbi_cur_ptr(iter) (iter).cur.ptr + +#endif + +#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) + +/* Compare strings S1 and S2, ignoring case, returning less than, equal to or + greater than zero if S1 is lexicographically less than, equal to or greater + than S2. + Note: This function may, in multibyte locales, return 0 for strings of + different lengths! */ +int +strcasecmp (const char *s1, const char *s2) +{ + if (s1 == s2) return 0; - do + /* Be careful not to look at the entire extent of s1 or s2 until needed. + This is useful because when two strings differ, the difference is + most often already in the very few first characters. */ +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) { - c1 = TOLOWER (*p1); - c2 = TOLOWER (*p2); + mbi_iterator_t iter1; + mbi_iterator_t iter2; - if (LENGTH_LIMIT_EXPR (--n == 0) || c1 == '\0') - break; + mbi_init (iter1, s1); + mbi_init (iter2, s2); - ++p1; - ++p2; + while (mbi_avail (iter1) && mbi_avail (iter2)) + { + /* Sort invalid characters after all valid ones. */ + if (!mbi_cur (iter1).wc_valid) + { + if (!mbi_cur (iter2).wc_valid) + { + /* Compare two invalid characters. */ + int cmp; + + if (mbi_cur (iter1).bytes > mbi_cur (iter2).bytes) + return 1; + if (mbi_cur (iter1).bytes < mbi_cur (iter2).bytes) + return -1; + cmp = memcmp (mbi_cur_ptr (iter1), mbi_cur_ptr (iter2), + mbi_cur (iter1).bytes); + if (cmp != 0) + return cmp; + } + else + /* mbi_cur (iter1) invalid, mbi_cur (iter2) valid. */ + return 1; + } + else + { + if (!mbi_cur (iter2).wc_valid) + /* mbi_cur (iter1) valid, mbi_cur (iter2) invalid. */ + return -1; + else + { + /* Compare two valid characters. */ + wchar_t c1 = towlower (mbi_cur (iter1).wc); + wchar_t c2 = towlower (mbi_cur (iter2).wc); + + if (c1 > c2) + return 1; + if (c1 < c2) + return -1; + } + } + mbi_advance (iter1); + mbi_advance (iter2); + } + if (mbi_avail (iter1)) + /* s2 terminated before s1. */ + return 1; + if (mbi_avail (iter2)) + /* s1 terminated before s2. */ + return -1; + return 0; } - while (c1 == c2); + else +#endif + { + const unsigned char *p1 = (const unsigned char *) s1; + const unsigned char *p2 = (const unsigned char *) s2; + unsigned char c1, c2; - return c1 - c2; + do + { + c1 = TOLOWER (*p1); + c2 = TOLOWER (*p2); + + if (c1 == '\0') + break; + + ++p1; + ++p2; + } + while (c1 == c2); + + return c1 - c2; + } } diff --git a/lib/strncasecmp.c b/lib/strncasecmp.c index 68d95aacc0..4e57debf51 100644 --- a/lib/strncasecmp.c +++ b/lib/strncasecmp.c @@ -1,2 +1,58 @@ -#define LENGTH_LIMIT -#include "strcasecmp.c" +/* strncasecmp.c -- case insensitive string comparator + Copyright (C) 1998, 1999 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#if HAVE_CONFIG_H +# include +#endif + +/* Specification. */ +#include "strcase.h" + +#include + +#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) + +/* Compare no more than N bytes of strings S1 and S2, + ignoring case, returning less than, equal to or + greater than zero if S1 is lexicographically less + than, equal to or greater than S2. */ + +int +strncasecmp (const char *s1, const char *s2, size_t n) +{ + register const unsigned char *p1 = (const unsigned char *) s1; + register const unsigned char *p2 = (const unsigned char *) s2; + unsigned char c1, c2; + + if (p1 == p2 || n == 0) + return 0; + + do + { + c1 = TOLOWER (*p1); + c2 = TOLOWER (*p2); + + if (--n == 0 || c1 == '\0') + break; + + ++p1; + ++p2; + } + while (c1 == c2); + + return c1 - c2; +} diff --git a/m4/ChangeLog b/m4/ChangeLog index d5df181d0d..a0ca3b26fa 100644 --- a/m4/ChangeLog +++ b/m4/ChangeLog @@ -1,3 +1,8 @@ +2005-08-16 Bruno Haible + + * strcase.m4 (gl_FUNC_STRCASECMP): Use the replacement function always. + (gl_PREREQ_STRCASECMP): Use gl_FUNC_MBRTOWC. + 2005-08-16 Bruno Haible * mbfile.m4: New file. diff --git a/m4/strcase.m4 b/m4/strcase.m4 index 40ace46a72..8a8ff3a451 100644 --- a/m4/strcase.m4 +++ b/m4/strcase.m4 @@ -1,5 +1,5 @@ -# strcase.m4 serial 1 -dnl Copyright (C) 2002 Free Software Foundation, Inc. +# strcase.m4 serial 2 +dnl Copyright (C) 2002, 2005 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. @@ -12,10 +12,11 @@ AC_DEFUN([gl_STRCASE], AC_DEFUN([gl_FUNC_STRCASECMP], [ - AC_REPLACE_FUNCS(strcasecmp) - if test $ac_cv_func_strcasecmp = no; then - gl_PREREQ_STRCASECMP - fi + dnl No known system has a strcasecmp() function that works correctly in + dnl multibyte locales. Therefore we use our version always. + AC_LIBOBJ(strcasecmp) + AC_DEFINE(strcasecmp, rpl_strcasecmp, [Define to rpl_strcasecmp always.]) + gl_PREREQ_STRCASECMP ]) AC_DEFUN([gl_FUNC_STRNCASECMP], @@ -28,7 +29,7 @@ AC_DEFUN([gl_FUNC_STRNCASECMP], # Prerequisites of lib/strcasecmp.c. AC_DEFUN([gl_PREREQ_STRCASECMP], [ - : + gl_FUNC_MBRTOWC ]) # Prerequisites of lib/strncasecmp.c. diff --git a/modules/strcase b/modules/strcase index 1f98231207..81c82d71e6 100644 --- a/modules/strcase +++ b/modules/strcase @@ -6,8 +6,11 @@ lib/strcase.h lib/strcasecmp.c lib/strncasecmp.c m4/strcase.m4 +m4/mbrtowc.m4 Depends-on: +strnlen1 +mbchar configure.ac: gl_STRCASE