+2011-01-21 Pádraig Brady <P@draigBrady.com>
+ Bruno Haible <bruno@clisp.org>
+
+ Make uN_strstr functions O(n) worst-case.
+ * lib/unistr/u-strstr.h (FUNC): In the 8-bit case, use strstr. In the
+ 16-bit and 32-bit unit cases, use the unibyte algorithm from
+ lib/mbsstr.c.
+ * lib/unistr/u8-strstr.c: Include <string.h>.
+ (UNIT_IS_UINT8_T): New macro.
+ * lib/unistr/u16-strstr.c: Include malloca.h and str-kmp.h.
+ (U_STRLEN, U_STRNLEN): New macros.
+ * lib/unistr/u32-strstr.c: Include malloca.h and str-kmp.h.
+ (U_STRLEN, U_STRNLEN): New macros.
+ * modules/unistr/u8-strstr (Depends-on): Add strstr.
+ (configure.ac): Update required libunistring version.
+ * modules/unistr/u16-strstr (Files): Add lib/str-kmp.h.
+ (Depends-on): Add unistr/u16-strlen, unistr/u16-strnlen, stdbool,
+ malloca.
+ (configure.ac): Update required libunistring version.
+ * modules/unistr/u32-strstr (Files): Add lib/str-kmp.h.
+ (Depends-on): Add unistr/u32-strlen, unistr/u32-strnlen, stdbool,
+ malloca.
+ (configure.ac): Update required libunistring version.
+
2011-01-21 Pádraig Brady <P@draigBrady.com>
Bruno Haible <bruno@clisp.org>
/* Substring test for UTF-8/UTF-16/UTF-32 strings.
Copyright (C) 1999, 2002, 2006, 2010-2011 Free Software Foundation, Inc.
- Written by Bruno Haible <bruno@clisp.org>, 2002.
+ Written by Bruno Haible <bruno@clisp.org>, 2002, 2005.
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
}
#endif
- /* Search for needle's first unit. */
- for (; *haystack != 0; haystack++)
- if (*haystack == first)
+#if UNIT_IS_UINT8_T
+ return (uint8_t *) strstr ((const char *) haystack, (const char *) needle);
+#else
+ {
+ /* Minimizing the worst-case complexity:
+ Let n = U_STRLEN(haystack), m = U_STRLEN(needle).
+ The naïve algorithm is O(n*m) worst-case.
+ The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
+ memory allocation.
+ To achieve linear complexity and yet amortize the cost of the
+ memory allocation, we activate the Knuth-Morris-Pratt algorithm
+ only once the naïve algorithm has already run for some time; more
+ precisely, when
+ - the outer loop count is >= 10,
+ - the average number of comparisons per outer loop is >= 5,
+ - the total number of comparisons is >= m.
+ But we try the Knuth-Morris-Pratt algorithm only once. If its memory
+ allocation attempt fails, we don't retry it. */
+ bool try_kmp = true;
+ size_t outer_loop_count = 0;
+ size_t comparison_count = 0;
+ size_t last_ccount = 0; /* last comparison count */
+ const UNIT *needle_last_ccount = needle; /* = needle + last_ccount */
+
+ /* Speed up the following searches of needle by caching its first
+ character. */
+ UNIT b = *needle++;
+
+ for (;; haystack++)
{
- /* Compare with needle's remaining units. */
- const UNIT *hptr = haystack + 1;
- const UNIT *nptr = needle + 1;
- for (;;)
+ if (*haystack == 0)
+ /* No match. */
+ return NULL;
+
+ /* See whether it's advisable to use an asymptotically faster
+ algorithm. */
+ if (try_kmp
+ && outer_loop_count >= 10
+ && comparison_count >= 5 * outer_loop_count)
{
- if (*hptr != *nptr)
- break;
- hptr++; nptr++;
- if (*nptr == 0)
- return (UNIT *) haystack;
+ /* See if needle + comparison_count now reaches the end of
+ needle. */
+ if (needle_last_ccount != NULL)
+ {
+ needle_last_ccount +=
+ U_STRNLEN (needle_last_ccount,
+ comparison_count - last_ccount);
+ if (*needle_last_ccount == 0)
+ needle_last_ccount = NULL;
+ last_ccount = comparison_count;
+ }
+ if (needle_last_ccount == NULL)
+ {
+ /* Try the Knuth-Morris-Pratt algorithm. */
+ const UNIT *result;
+ bool success =
+ knuth_morris_pratt (haystack,
+ needle - 1, U_STRLEN (needle - 1),
+ &result);
+ if (success)
+ return (UNIT *) result;
+ try_kmp = false;
+ }
}
- }
- return NULL;
+ outer_loop_count++;
+ comparison_count++;
+ if (*haystack == b)
+ /* The first character matches. */
+ {
+ const UNIT *rhaystack = haystack + 1;
+ const UNIT *rneedle = needle;
+
+ for (;; rhaystack++, rneedle++)
+ {
+ if (*rneedle == 0)
+ /* Found a match. */
+ return (UNIT *) haystack;
+ if (*rhaystack == 0)
+ /* No match. */
+ return NULL;
+ comparison_count++;
+ if (*rhaystack != *rneedle)
+ /* Nothing in this round. */
+ break;
+ }
+ }
+ }
+ }
+#endif
}