* lib/regex_internal.c (re_string_reconstruct): Handle offset < pstr->valid_raw_len && pstr->offsets_needed case.

author Paul Eggert <eggert@cs.ucla.edu>

Thu, 21 Sep 2006 05:25:11 +0000 (05:25 +0000)

committer Paul Eggert <eggert@cs.ucla.edu>

Thu, 21 Sep 2006 05:25:11 +0000 (05:25 +0000)
author Paul Eggert <eggert@cs.ucla.edu>
Thu, 21 Sep 2006 05:25:11 +0000 (05:25 +0000)
committer Paul Eggert <eggert@cs.ucla.edu>
Thu, 21 Sep 2006 05:25:11 +0000 (05:25 +0000)
diff --git a/lib/ChangeLog b/lib/ChangeLog

index 85a95c70d1888ba3ce7db8393a0fcfacbd3bed72..046c22b717d5fde2116e04682bfb5560bb9f98cd 100644 (file)
--- a/lib/ChangeLog
+++ b/lib/ChangeLog
@@ -1,3 +1,15 @@
+2006-09-20  Paul Eggert  <eggert@cs.ucla.edu>
+
+       Import this patch from libc:
+
+       2006-09-06  Jakub Jelinek  <jakub@redhat.com>
+
+       * regex_internal.c (re_string_reconstruct): Handle
+       offset < pstr->valid_raw_len && pstr->offsets_needed case.
+       Ensure no bytes read before raw_mbs array.  Pass a saved copy of
+       pstr->valid_len - 1 rather than pstr->valid_raw_len - 1 to
+       re_string_context_at.
+
  2006-09-20  Bruno Haible  <bruno@clisp.org>
  
         * mkdtemp.c: Import from libc.
diff --git a/lib/regex_internal.c b/lib/regex_internal.c

index fa5bfbb5c47286a831e5d20b5f187fba2db1ed46..78e16f330147e97e27ca3489bd9815f1de04da5c 100644 (file)
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -597,34 +597,98 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
  
    if (BE (offset != 0, 1))
      {
-      /* Are the characters which are already checked remain?  */
-      if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
-         /* Handling this would enlarge the code too much.
-            Accept a slowdown in that case.  */
-         && pstr->offsets_needed == 0
-#endif
-        )
+      /* Should the already checked characters be kept?  */
+      if (BE (offset < pstr->valid_raw_len, 1))
         {
           /* Yes, move them to the front of the buffer.  */
-         pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
  #ifdef RE_ENABLE_I18N
-         if (pstr->mb_cur_max > 1)
-           memmove (pstr->wcs, pstr->wcs + offset,
-                    (pstr->valid_len - offset) * sizeof (wint_t));
+         if (BE (pstr->offsets_needed, 0))
+           {
+             Idx low = 0, high = pstr->valid_len, mid;
+             do
+               {
+                 mid = (high + low) / 2;
+                 if (pstr->offsets[mid] > offset)
+                   high = mid;
+                 else if (pstr->offsets[mid] < offset)
+                   low = mid + 1;
+                 else
+                   break;
+               }
+             while (low < high);
+             if (pstr->offsets[mid] < offset)
+               ++mid;
+             pstr->tip_context = re_string_context_at (pstr, mid - 1,
+                                                       eflags);
+             /* This can be quite complicated, so handle specially
+                only the common and easy case where the character with
+                different length representation of lower and upper
+                case is present at or after offset.  */
+             if (pstr->valid_len > offset
+                 && mid == offset && pstr->offsets[mid] == offset)
+               {
+                 memmove (pstr->wcs, pstr->wcs + offset,
+                          (pstr->valid_len - offset) * sizeof (wint_t));
+                 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+                 pstr->valid_len -= offset;
+                 pstr->valid_raw_len -= offset;
+                 for (low = 0; low < pstr->valid_len; low++)
+                   pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+               }
+             else
+               {
+                 /* Otherwise, just find out how long the partial multibyte
+                    character at offset is and fill it with WEOF/255.  */
+                 pstr->len = pstr->raw_len - idx + offset;
+                 pstr->stop = pstr->raw_stop - idx + offset;
+                 pstr->offsets_needed = 0;
+                 while (mid > 0 && pstr->offsets[mid - 1] == offset)
+                   --mid;
+                 while (mid < pstr->valid_len)
+                   if (pstr->wcs[mid] != WEOF)
+                     break;
+                   else
+                     ++mid;
+                 if (mid == pstr->valid_len)
+                   pstr->valid_len = 0;
+                 else
+                   {
+                     pstr->valid_len = pstr->offsets[mid] - offset;
+                     if (pstr->valid_len)
+                       {
+                         for (low = 0; low < pstr->valid_len; ++low)
+                           pstr->wcs[low] = WEOF;
+                         memset (pstr->mbs, 255, pstr->valid_len);
+                       }
+                   }
+                 pstr->valid_raw_len = pstr->valid_len;
+               }
+           }
+         else
+#endif
+           {
+             pstr->tip_context = re_string_context_at (pstr, offset - 1,
+                                                       eflags);
+#ifdef RE_ENABLE_I18N
+             if (pstr->mb_cur_max > 1)
+               memmove (pstr->wcs, pstr->wcs + offset,
+                        (pstr->valid_len - offset) * sizeof (wint_t));
  #endif /* RE_ENABLE_I18N */
-         if (BE (pstr->mbs_allocated, 0))
-           memmove (pstr->mbs, pstr->mbs + offset,
-                    pstr->valid_len - offset);
-         pstr->valid_len -= offset;
-         pstr->valid_raw_len -= offset;
+             if (BE (pstr->mbs_allocated, 0))
+               memmove (pstr->mbs, pstr->mbs + offset,
+                        pstr->valid_len - offset);
+             pstr->valid_len -= offset;
+             pstr->valid_raw_len -= offset;
  #if DEBUG
-         assert (pstr->valid_len > 0);
+             assert (pstr->valid_len > 0);
  #endif
+           }
         }
        else
         {
           /* No, skip all characters until IDX.  */
+         Idx prev_valid_len = pstr->valid_len;
+
  #ifdef RE_ENABLE_I18N
           if (BE (pstr->offsets_needed, 0))
             {
@@ -648,6 +712,8 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
                      byte other than 0x80 - 0xbf.  */
                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
                   end = raw + (offset - pstr->mb_cur_max);
+                 if (end < pstr->raw_mbs)
+                   end = pstr->raw_mbs;
                   p = raw + offset - 1;
  #ifdef _LIBC
                   /* We know the wchar_t encoding is UCS4, so for the simple
@@ -655,7 +721,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
                     {
                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
-                     pstr->valid_len = 0;
+                     /* pstr->valid_len = 0; */
                       wc = (wchar_t) *p;
                     }
                   else
@@ -698,7 +764,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
               if (wc == WEOF)
                 pstr->tip_context
-                 = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+                 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
               else
                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
                                       && IS_WIDE_WORD_CHAR (wc))
@@ -711,7 +777,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
                     pstr->wcs[wcs_idx] = WEOF;
                   if (pstr->mbs_allocated)
-                   memset (pstr->mbs, -1, pstr->valid_len);
+                   memset (pstr->mbs, 255, pstr->valid_len);
                 }
               pstr->valid_raw_len = pstr->valid_len;
             }
diff --git a/m4/ChangeLog b/m4/ChangeLog

index 55759549c8ae54b488a67b27b9d2c617e343f0ed..908c93d265aabadfb5d4d008d5d4b9fab8f308b6 100644 (file)
--- a/m4/ChangeLog
+++ b/m4/ChangeLog
@@ -1,5 +1,12 @@
  2006-09-20  Paul Eggert  <eggert@cs.ucla.edu>
  
+       * regex.m4 (gl_REGEX): Check for locale.h, since the test
+       now requires it.
+       (gl_PREREQ_REGEX): Don't check for locale.h any more, since
+       gl_REGEX now does it for us.
+       (gl_REGEX): Add test taken from
+       http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html.
+
         * mkstemp.m4 (gl_FUNC_MKSTEMP): Require AC_SYS_LARGEFILE.
         Check that large offsets work.  Modernize Autoconf usages.
         Prefer "yes" to mean a good thing rather than a bad.
diff --git a/m4/regex.m4 b/m4/regex.m4

index 46c0131e4a6656ba0776c7ba2bc39a7350df0bf1..69dc326f7499e5465dba8d0f94e3a2dda1a8fb72 100644 (file)
--- a/m4/regex.m4
+++ b/m4/regex.m4
@@ -1,4 +1,4 @@
-#serial 39
+#serial 40
  
  # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
  # 2006 Free Software Foundation, Inc.
@@ -14,6 +14,8 @@ AC_PREREQ([2.50])
  
  AC_DEFUN([gl_REGEX],
  [
+  AC_CHECK_HEADERS_ONCE([locale.h])
+
    AC_ARG_WITH([included-regex],
      [AC_HELP_STRING([--without-included-regex],
                     [don't compile regex; this is the default on
@@ -34,6 +36,9 @@ AC_DEFUN([gl_REGEX],
        [AC_RUN_IFELSE(
         [AC_LANG_PROGRAM(
           [AC_INCLUDES_DEFAULT
+          #if HAVE_LOCALE_H
+           #include <locale.h>
+          #endif
            #include <limits.h>
            #include <regex.h>
            ],
@@ -42,6 +47,33 @@ AC_DEFUN([gl_REGEX],
             int i;
             const char *s;
             struct re_registers regs;
+
+           #if HAVE_LOCALE_H
+             /* http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html
+                This test needs valgrind to catch the bug on Debian
+                GNU/Linux 3.1 x86, but it might catch the bug better
+                on other platforms and it shouldn't hurt to try the
+                test here.  */
+             if (setlocale (LC_ALL, "en_US.UTF-8"))
+               {
+                 static char const pat[] = "insert into";
+                 static char const data[] =
+                   "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK";
+                 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE
+                                | RE_ICASE);
+                 memset (&regex, 0, sizeof regex);
+                 s = re_compile_pattern (pat, sizeof pat - 1, &regex);
+                 if (s)
+                   return 1;
+                 if (re_search (&regex, data, sizeof data - 1,
+                                0, sizeof data - 1, &regs)
+                     != -1)
+                   return 1;
+                 if (! setlocale (LC_ALL, "C"))
+                   return 1;
+               }
+           #endif
+
             re_set_syntax (RE_SYNTAX_POSIX_EGREP);
             memset (&regex, 0, sizeof (regex));
             for (i = 0; i <= UCHAR_MAX; i++)
@@ -161,7 +193,7 @@ AC_DEFUN([gl_PREREQ_REGEX],
    AC_REQUIRE([AC_GNU_SOURCE])
    AC_REQUIRE([AC_C_RESTRICT])
    AC_REQUIRE([AM_LANGINFO_CODESET])
-  AC_CHECK_HEADERS_ONCE([locale.h wchar.h wctype.h])
+  AC_CHECK_HEADERS_ONCE([wchar.h wctype.h])
    AC_CHECK_FUNCS_ONCE([mbrtowc mempcpy wcrtomb wcscoll])
    AC_CHECK_DECLS([isblank], [], [], [#include <ctype.h>])
  ])
author	Paul Eggert <eggert@cs.ucla.edu>
	Thu, 21 Sep 2006 05:25:11 +0000 (05:25 +0000)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Thu, 21 Sep 2006 05:25:11 +0000 (05:25 +0000)
lib/ChangeLog		patch \| blob \| history
lib/regex_internal.c		patch \| blob \| history
m4/ChangeLog		patch \| blob \| history
m4/regex.m4		patch \| blob \| history