* lib/memmem.c (knuth_morris_pratt): New function.
(memmem): Use it if first few naive iterations fail.
* m4/memmem.m4 (gl_FUNC_MEMMEM): Detect cygwin bug.
* modules/memcmp (License): Set to LGPLv2+, not LGPL.
* modules/memchr (License): Likewise.
* modules/memmem (Depends-on): Add memcmp, memchr, stdbool, and
malloca.
* tests/test-memmem.c: Rewrite, borrowing ideas from
test-mbsstr1.c; the old version wouldn't even compile!
* modules/memmem-tests: New file.
* lib/string.in.h (rpl_memmem): Add declaration.
* modules/string (Makefile.am): Substitute REPLACE_MEMMEM.
* m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Default for
REPLACE_MEMMEM.
Signed-off-by: Eric Blake <ebb9@byu.net>
+2007-12-19 Eric Blake <ebb9@byu.net>
+
+ Fix memmem to avoid O(n^2) worst-case complexity.
+ * lib/memmem.c (knuth_morris_pratt): New function.
+ (memmem): Use it if first few naive iterations fail.
+ * m4/memmem.m4 (gl_FUNC_MEMMEM): Detect cygwin bug.
+ * modules/memcmp (License): Set to LGPLv2+, not LGPL.
+ * modules/memchr (License): Likewise.
+ * modules/memmem (Depends-on): Add memcmp, memchr, stdbool, and
+ malloca.
+ * tests/test-memmem.c: Rewrite, borrowing ideas from
+ test-mbsstr1.c; the old version wouldn't even compile!
+ * modules/memmem-tests: New file.
+ * lib/string.in.h (rpl_memmem): Add declaration.
+ * modules/string (Makefile.am): Substitute REPLACE_MEMMEM.
+ * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Default for
+ REPLACE_MEMMEM.
+
2007-12-18 Paul Eggert <eggert@cs.ucla.edu>
Fix problem with _GL_JUST_INCLUDE_SYSTEM_INTTYPES_H on VMS.
#include <stddef.h>
#include <string.h>
+#include <stdbool.h>
+
+#include "malloca.h"
#ifndef _LIBC
# define __builtin_expect(expr, val) (expr)
#endif
-#undef memmem
+/* Knuth-Morris-Pratt algorithm.
+ See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
+ Return a boolean indicating success. */
+
+static bool
+knuth_morris_pratt (const char *haystack, const char *last_haystack,
+ const char *needle, size_t m,
+ const char **resultp)
+{
+ /* Allocate the table. */
+ size_t *table = (size_t *) malloca (m * sizeof (size_t));
+ if (table == NULL)
+ return false;
+ /* Fill the table.
+ For 0 < i < m:
+ 0 < table[i] <= i is defined such that
+ rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i]
+ implies
+ forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1],
+ and table[i] is as large as possible with this property.
+ table[0] remains uninitialized. */
+ {
+ size_t i, j;
+
+ table[1] = 1;
+ j = 0;
+ for (i = 2; i < m; i++)
+ {
+ unsigned char b = (unsigned char) needle[i - 1];
-/* Return the first occurrence of NEEDLE in HAYSTACK. */
+ for (;;)
+ {
+ if (b == (unsigned char) needle[j])
+ {
+ table[i] = i - ++j;
+ break;
+ }
+ if (j == 0)
+ {
+ table[i] = i;
+ break;
+ }
+ j = j - table[j];
+ }
+ }
+ }
+
+ /* Search, using the table to accelerate the processing. */
+ {
+ size_t j;
+ const char *rhaystack;
+ const char *phaystack;
+
+ *resultp = NULL;
+ j = 0;
+ rhaystack = haystack;
+ phaystack = haystack;
+ /* Invariant: phaystack = rhaystack + j. */
+ while (phaystack != last_haystack)
+ if ((unsigned char) needle[j] == (unsigned char) *phaystack)
+ {
+ j++;
+ phaystack++;
+ if (j == m)
+ {
+ /* The entire needle has been found. */
+ *resultp = rhaystack;
+ break;
+ }
+ }
+ else if (j > 0)
+ {
+ /* Found a match of needle[0..j-1], mismatch at needle[j]. */
+ rhaystack += table[j];
+ j -= table[j];
+ }
+ else
+ {
+ /* Found a mismatch at needle[0] already. */
+ rhaystack++;
+ phaystack++;
+ }
+ }
+
+ freea (table);
+ return true;
+}
+
+/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
+ if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
+ HAYSTACK. */
void *
-memmem (haystack, haystack_len, needle, needle_len)
- const void *haystack;
- size_t haystack_len;
- const void *needle;
- size_t needle_len;
+memmem (const void *haystack, size_t haystack_len,
+ const void *needle, size_t needle_len)
{
- const char *begin;
- const char *const last_possible
- = (const char *) haystack + haystack_len - needle_len;
+ /* Operating with void * is awkward. */
+ const char *Haystack = (const char *) haystack;
+ const char *Needle = (const char *) needle;
+ const char *last_haystack = Haystack + haystack_len;
+ const char *last_needle = Needle + needle_len;
if (needle_len == 0)
/* The first occurrence of the empty string is deemed to occur at
if (__builtin_expect (haystack_len < needle_len, 0))
return NULL;
- for (begin = (const char *) haystack; begin <= last_possible; ++begin)
- if (begin[0] == ((const char *) needle)[0] &&
- !memcmp ((const void *) &begin[1],
- (const void *) ((const char *) needle + 1),
- needle_len - 1))
- return (void *) begin;
+ /* Use optimizations in memchr when possible. */
+ if (__builtin_expect (needle_len == 1, 0))
+ return memchr (haystack, (unsigned char) *Needle, haystack_len);
+
+ /* Minimizing the worst-case complexity:
+ Let n = haystack_len, m = needle_len.
+ The naïve algorithm is O(n*m) worst-case.
+ The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
+ memory allocation.
+ To achieve linear complexity and yet amortize the cost of the
+ memory allocation, we activate the Knuth-Morris-Pratt algorithm
+ only once the naïve algorithm has already run for some time; more
+ precisely, when
+ - the outer loop count is >= 10,
+ - the average number of comparisons per outer loop is >= 5,
+ - the total number of comparisons is >= m.
+ But we try it only once. If the memory allocation attempt failed,
+ we don't retry it. */
+ {
+ bool try_kmp = true;
+ size_t outer_loop_count = 0;
+ size_t comparison_count = 0;
+
+ /* Speed up the following searches of needle by caching its first
+ character. */
+ char b = *Needle++;
+
+ for (;; Haystack++)
+ {
+ if (Haystack == last_haystack)
+ /* No match. */
+ return NULL;
+
+ /* See whether it's advisable to use an asymptotically faster
+ algorithm. */
+ if (try_kmp
+ && outer_loop_count >= 10
+ && comparison_count >= 5 * outer_loop_count)
+ {
+ /* See if needle + comparison_count now reaches the end of
+ needle. */
+ if (comparison_count >= needle_len)
+ {
+ /* Try the Knuth-Morris-Pratt algorithm. */
+ const char *result;
+ if (knuth_morris_pratt (Haystack, last_haystack,
+ Needle - 1, needle_len, &result))
+ return (void *) result;
+ try_kmp = false;
+ }
+ }
+
+ outer_loop_count++;
+ comparison_count++;
+ if (*Haystack == b)
+ /* The first character matches. */
+ {
+ const char *rhaystack = Haystack + 1;
+ const char *rneedle = Needle;
+
+ for (;; rhaystack++, rneedle++)
+ {
+ if (rneedle == last_needle)
+ /* Found a match. */
+ return (void *) Haystack;
+ if (rhaystack == last_haystack)
+ /* No match. */
+ return NULL;
+ comparison_count++;
+ if (*rhaystack != *rneedle)
+ /* Nothing in this round. */
+ break;
+ }
+ }
+ }
+ }
return NULL;
}
/* Return the first occurrence of NEEDLE in HAYSTACK. */
#if @GNULIB_MEMMEM@
-# if ! @HAVE_DECL_MEMMEM@
+# if @REPLACE_MEMMEM@
+# define memmem rpl_memmem
+# endif
+# if ! @HAVE_DECL_MEMMEM@ || @REPLACE_MEMMEM@
extern void *memmem (void const *__haystack, size_t __haystack_len,
void const *__needle, size_t __needle_len);
# endif
-# memmem.m4 serial 5
+# memmem.m4 serial 6
dnl Copyright (C) 2002, 2003, 2004, 2007 Free Software Foundation, Inc.
dnl This file is free software; the Free Software Foundation
dnl gives unlimited permission to copy and/or distribute it,
AC_CHECK_DECLS_ONCE(memmem)
if test $ac_cv_have_decl_memmem = no; then
HAVE_DECL_MEMMEM=0
+ else
+ AC_CACHE_CHECK([whether memmem works], [gl_cv_func_memmem_works],
+ [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <string.h>],
+ [return !memmem ("a", 1, NULL, 0);])],
+ [gl_cv_func_memmem_works=yes], [gl_cv_func_memmem_works=no],
+ [dnl pessimistically assume the worst, since even glibc 2.6.1
+ dnl has quadratic complexity in its memmem
+ gl_cv_func_memmem_works=no])])
+ if test $gl_cv_func_memmem_works = no; then
+ REPLACE_MEMMEM=1
+ AC_LIBOBJ([memmem])
+ fi
fi
gl_PREREQ_MEMMEM
])
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
+# serial 2
+
# Written by Paul Eggert.
AC_DEFUN([gl_HEADER_STRING_H],
HAVE_DECL_STRTOK_R=1; AC_SUBST([HAVE_DECL_STRTOK_R])
HAVE_DECL_STRERROR=1; AC_SUBST([HAVE_DECL_STRERROR])
REPLACE_STRERROR=0; AC_SUBST([REPLACE_STRERROR])
+ REPLACE_MEMMEM=0; AC_SUBST([REPLACE_MEMMEM])
])
<string.h>
License:
-LGPL
+LGPLv2+
Maintainer:
Jim Meyering, glibc
<string.h>
License:
-LGPL
+LGPLv2+
Maintainer:
Jim Meyering, glibc
Depends-on:
extensions
string
+stdbool
+malloca
+memchr
+memcmp
configure.ac:
gl_FUNC_MEMMEM
--- /dev/null
+Files:
+tests/test-memmem.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-memmem
+TESTS_ENVIRONMENT += EXEEXT='@EXEEXT@'
+check_PROGRAMS += test-memmem
+
-e 's|@''HAVE_STRCASESTR''@|$(HAVE_STRCASESTR)|g' \
-e 's|@''HAVE_DECL_STRTOK_R''@|$(HAVE_DECL_STRTOK_R)|g' \
-e 's|@''HAVE_DECL_STRERROR''@|$(HAVE_DECL_STRERROR)|g' \
+ -e 's|@''REPLACE_MEMMEM''@|$(REPLACE_MEMMEM)|g' \
-e 's|@''REPLACE_STRERROR''@|$(REPLACE_STRERROR)|g' \
-e '/definition of GL_LINK_WARNING/r $(LINK_WARNING_H)' \
< $(srcdir)/string.in.h; \
#include <stdio.h>
#include <string.h>
+#include <stdlib.h>
+
+#define ASSERT(expr) \
+ do \
+ { \
+ if (!(expr)) \
+ { \
+ fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
+ abort (); \
+ } \
+ } \
+ while (0)
int
main (int argc, char *argv[])
{
- const char *big = "foo-bar-baz";
- char *p;
-
- p = memmem (big, "foo", strlen (big));
- if (p != big)
- {
- fprintf (stderr, "memmem FAILURE 1\n");
- return 1;
- }
-
- p = memmem (big, "baz", strlen (big));
- if (p != big + strlen (big) - 3)
- {
- fprintf (stderr, "memmem FAILURE 2\n");
- return 1;
-
- p = memmem (big, "-", strlen (big));
- if (p != big + 3)
- {
- fprintf (stderr, "memmem FAILURE 3\n");
- return 1;
- }
-
- p = memmem (big, "baz", strlen (big) - 1);
- if (p != NULL)
- {
- fprintf (stderr, "memmem FAILURE 4\n");
- return 1;
- }
+ {
+ const char input[] = "foo";
+ const char *result = memmem (input, strlen (input), "", 0);
+ ASSERT (result == input);
+ }
+
+ {
+ const char input[] = "foo";
+ const char *result = memmem (input, strlen (input), "o", 1);
+ ASSERT (result == input + 1);
+ }
+
+ {
+ const char input[] = "ABC ABCDAB ABCDABCDABDE";
+ const char *result = memmem (input, strlen (input), "ABCDABD", 7);
+ ASSERT (result == input + 15);
+ }
+
+ {
+ const char input[] = "ABC ABCDAB ABCDABCDABDE";
+ const char *result = memmem (input, strlen (input), "ABCDABE", 7);
+ ASSERT (result == NULL);
+ }
+
+ /* Check that length 0 does not dereference NULL. */
+ {
+ const char *result = memmem (NULL, 0, "foo", 3);
+ ASSERT (result == NULL);
+ }
+
+ {
+ const char input[] = "foo";
+ const char *result = memmem (input, strlen (input), NULL, 0);
+ ASSERT (result == input);
+ }
+
+ /* Check that a very long haystack is handled quickly if the needle is
+ short and occurs near the beginning. */
+ {
+ size_t repeat = 10000;
+ size_t m = 1000000;
+ char *needle =
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+ size_t n = strlen (needle);
+ char *haystack = (char *) malloc (m + 1);
+ if (haystack != NULL)
+ {
+ memset (haystack, 'A', m);
+ haystack[0] = 'B';
+
+ for (; repeat > 0; repeat--)
+ {
+ ASSERT (memmem (haystack, m, needle, n) == haystack + 1);
+ }
+
+ free (haystack);
+ }
+ }
+
+ /* Check that a very long needle is discarded quickly if the haystack is
+ short. */
+ {
+ size_t repeat = 10000;
+ size_t m = 1000000;
+ char *haystack =
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ "ABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABAB";
+ size_t n = strlen (haystack);
+ char *needle = (char *) malloc (m + 1);
+ if (needle != NULL)
+ {
+ memset (needle, 'A', m);
+
+ for (; repeat > 0; repeat--)
+ {
+ ASSERT (memmem (haystack, n, needle, m) == NULL);
+ }
+
+ free (needle);
+ }
+ }
+
+ /* Check that the asymptotic worst-case complexity is not quadratic. */
+ {
+ size_t m = 1000000;
+ char *haystack = (char *) malloc (2 * m + 1);
+ char *needle = (char *) malloc (m + 1);
+ if (haystack != NULL && needle != NULL)
+ {
+ const char *result;
+
+ memset (haystack, 'A', 2 * m);
+ haystack[2 * m] = 'B';
+
+ memset (needle, 'A', m);
+ needle[m] = 'B';
+
+ result = memmem (haystack, 2 * m + 1, needle, m + 1);
+ ASSERT (result == haystack + m);
+ }
+ if (needle != NULL)
+ free (needle);
+ if (haystack != NULL)
+ free (haystack);
+ }
return 0;
}