if (BE (offset != 0, 1))
{
- /* Are the characters which are already checked remain? */
- if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
- /* Handling this would enlarge the code too much.
- Accept a slowdown in that case. */
- && pstr->offsets_needed == 0
-#endif
- )
+ /* Should the already checked characters be kept? */
+ if (BE (offset < pstr->valid_raw_len, 1))
{
/* Yes, move them to the front of the buffer. */
- pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
#ifdef RE_ENABLE_I18N
- if (pstr->mb_cur_max > 1)
- memmove (pstr->wcs, pstr->wcs + offset,
- (pstr->valid_len - offset) * sizeof (wint_t));
+ if (BE (pstr->offsets_needed, 0))
+ {
+ Idx low = 0, high = pstr->valid_len, mid;
+ do
+ {
+ mid = (high + low) / 2;
+ if (pstr->offsets[mid] > offset)
+ high = mid;
+ else if (pstr->offsets[mid] < offset)
+ low = mid + 1;
+ else
+ break;
+ }
+ while (low < high);
+ if (pstr->offsets[mid] < offset)
+ ++mid;
+ pstr->tip_context = re_string_context_at (pstr, mid - 1,
+ eflags);
+ /* This can be quite complicated, so handle specially
+ only the common and easy case where the character with
+ different length representation of lower and upper
+ case is present at or after offset. */
+ if (pstr->valid_len > offset
+ && mid == offset && pstr->offsets[mid] == offset)
+ {
+ memmove (pstr->wcs, pstr->wcs + offset,
+ (pstr->valid_len - offset) * sizeof (wint_t));
+ memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+ pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
+ for (low = 0; low < pstr->valid_len; low++)
+ pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+ }
+ else
+ {
+ /* Otherwise, just find out how long the partial multibyte
+ character at offset is and fill it with WEOF/255. */
+ pstr->len = pstr->raw_len - idx + offset;
+ pstr->stop = pstr->raw_stop - idx + offset;
+ pstr->offsets_needed = 0;
+ while (mid > 0 && pstr->offsets[mid - 1] == offset)
+ --mid;
+ while (mid < pstr->valid_len)
+ if (pstr->wcs[mid] != WEOF)
+ break;
+ else
+ ++mid;
+ if (mid == pstr->valid_len)
+ pstr->valid_len = 0;
+ else
+ {
+ pstr->valid_len = pstr->offsets[mid] - offset;
+ if (pstr->valid_len)
+ {
+ for (low = 0; low < pstr->valid_len; ++low)
+ pstr->wcs[low] = WEOF;
+ memset (pstr->mbs, 255, pstr->valid_len);
+ }
+ }
+ pstr->valid_raw_len = pstr->valid_len;
+ }
+ }
+ else
+#endif
+ {
+ pstr->tip_context = re_string_context_at (pstr, offset - 1,
+ eflags);
+#ifdef RE_ENABLE_I18N
+ if (pstr->mb_cur_max > 1)
+ memmove (pstr->wcs, pstr->wcs + offset,
+ (pstr->valid_len - offset) * sizeof (wint_t));
#endif /* RE_ENABLE_I18N */
- if (BE (pstr->mbs_allocated, 0))
- memmove (pstr->mbs, pstr->mbs + offset,
- pstr->valid_len - offset);
- pstr->valid_len -= offset;
- pstr->valid_raw_len -= offset;
+ if (BE (pstr->mbs_allocated, 0))
+ memmove (pstr->mbs, pstr->mbs + offset,
+ pstr->valid_len - offset);
+ pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
#if DEBUG
- assert (pstr->valid_len > 0);
+ assert (pstr->valid_len > 0);
#endif
+ }
}
else
{
/* No, skip all characters until IDX. */
+ Idx prev_valid_len = pstr->valid_len;
+
#ifdef RE_ENABLE_I18N
if (BE (pstr->offsets_needed, 0))
{
byte other than 0x80 - 0xbf. */
raw = pstr->raw_mbs + pstr->raw_mbs_idx;
end = raw + (offset - pstr->mb_cur_max);
+ if (end < pstr->raw_mbs)
+ end = pstr->raw_mbs;
p = raw + offset - 1;
#ifdef _LIBC
/* We know the wchar_t encoding is UCS4, so for the simple
if (isascii (*p) && BE (pstr->trans == NULL, 1))
{
memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
- pstr->valid_len = 0;
+ /* pstr->valid_len = 0; */
wc = (wchar_t) *p;
}
else
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
if (wc == WEOF)
pstr->tip_context
- = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+ = re_string_context_at (pstr, prev_valid_len - 1, eflags);
else
pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
&& IS_WIDE_WORD_CHAR (wc))
for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
pstr->wcs[wcs_idx] = WEOF;
if (pstr->mbs_allocated)
- memset (pstr->mbs, -1, pstr->valid_len);
+ memset (pstr->mbs, 255, pstr->valid_len);
}
pstr->valid_raw_len = pstr->valid_len;
}
-#serial 39
+#serial 40
# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
# 2006 Free Software Foundation, Inc.
AC_DEFUN([gl_REGEX],
[
+ AC_CHECK_HEADERS_ONCE([locale.h])
+
AC_ARG_WITH([included-regex],
[AC_HELP_STRING([--without-included-regex],
[don't compile regex; this is the default on
[AC_RUN_IFELSE(
[AC_LANG_PROGRAM(
[AC_INCLUDES_DEFAULT
+ #if HAVE_LOCALE_H
+ #include <locale.h>
+ #endif
#include <limits.h>
#include <regex.h>
],
int i;
const char *s;
struct re_registers regs;
+
+ #if HAVE_LOCALE_H
+ /* http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html
+ This test needs valgrind to catch the bug on Debian
+ GNU/Linux 3.1 x86, but it might catch the bug better
+ on other platforms and it shouldn't hurt to try the
+ test here. */
+ if (setlocale (LC_ALL, "en_US.UTF-8"))
+ {
+ static char const pat[] = "insert into";
+ static char const data[] =
+ "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK";
+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE
+ | RE_ICASE);
+ memset (®ex, 0, sizeof regex);
+ s = re_compile_pattern (pat, sizeof pat - 1, ®ex);
+ if (s)
+ return 1;
+ if (re_search (®ex, data, sizeof data - 1,
+ 0, sizeof data - 1, ®s)
+ != -1)
+ return 1;
+ if (! setlocale (LC_ALL, "C"))
+ return 1;
+ }
+ #endif
+
re_set_syntax (RE_SYNTAX_POSIX_EGREP);
memset (®ex, 0, sizeof (regex));
for (i = 0; i <= UCHAR_MAX; i++)
AC_REQUIRE([AC_GNU_SOURCE])
AC_REQUIRE([AC_C_RESTRICT])
AC_REQUIRE([AM_LANGINFO_CODESET])
- AC_CHECK_HEADERS_ONCE([locale.h wchar.h wctype.h])
+ AC_CHECK_HEADERS_ONCE([wchar.h wctype.h])
AC_CHECK_FUNCS_ONCE([mbrtowc mempcpy wcrtomb wcscoll])
AC_CHECK_DECLS([isblank], [], [], [#include <ctype.h>])
])