Use memmem algorithm for strstr

Message ID DB5PR08MB1030A9C447986450613546D683B60@DB5PR08MB1030.eurprd08.prod.outlook.com
State New
Headers show
Series
  • Use memmem algorithm for strstr
Related show

Commit Message

Wilco Dijkstra Dec. 27, 2018, 6:30 p.m.
This patch further improves performance of strstr by using the new memmem
algorithm (http://www.cygwin.com/ml/newlib/2018/msg01151.html). 
The performance gain on English text with randomized needles of average size
8 is 25% on small haystacks and 75% for large haystacks.

Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test
(https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c).

--

Comments

Eric Blake Jan. 3, 2019, 7:18 p.m. | #1
On 12/27/18 12:30 PM, Wilco Dijkstra wrote:
> This patch further improves performance of strstr by using the new memmem

> algorithm (http://www.cygwin.com/ml/newlib/2018/msg01151.html). 

> The performance gain on English text with randomized needles of average size

> 8 is 25% on small haystacks and 75% for large haystacks.

> 

> Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test

> (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c).

> 

> --


> @@ -150,10 +130,6 @@ strstr (const char *haystack, const char *needle)

>      return (char*)strchr (hs, ne[0]);

>    if (ne[2] == '\0')

>      return strstr2 (hs, ne);

> -  if (ne[3] == '\0')

> -    return strstr3 (hs, ne);

> -  if (ne[4] == '\0')

> -    return strstr4 (hs, ne);

>  

>    size_t ne_len = strlen (ne);

>    size_t hs_len = strnlen (hs, ne_len | 512);


Is the new code faster than strstr[34] on short needles, or should you
be keeping those versions around?

Calling strlen (ne) can be wasteful for someone that calls
strstr("short", "super_long.....") - such callers will always get an
answer of NULL, but the question is how much dead work did we do in the
meantime.  My original Two-Way implementation tried hard to not compute
the bounds of EITHER needle or haystack in isolation, but rather made a
first pass through both simultaneously to detect early exit
opportunities long before an unbounded strlen() on a disproportionately
long counterpart.  Thus, I think this code should be using strnlen (ne,
256) before deciding to switch to Two-Way, rather than an unbounded length.

> @@ -162,39 +138,59 @@ strstr (const char *haystack, const char *needle)

>    if (hs_len < ne_len)

>      return NULL;

>  

> -  /* Use the Quick-Search algorithm for needle lengths less than 255.  */

> -  if (__builtin_expect (ne_len < 255, 1))

> +  /* Use Two-Way algorithm for very long needles.  */

> +  if (__builtin_expect (ne_len > 256, 0))

> +    return two_way_long_needle (hs, hs_len, ne, ne_len);


Is hs_len accurate (or even ne_len, if you take my suggestion to use
strnlen() in its compuatation)?  Or do you first need to (re-)compute
accurate lengths?

> +

> +  const unsigned char *end = hs + hs_len - ne_len;

> +  uint8_t shift[256];

> +  size_t tmp, shift1;

> +  size_t m1 = ne_len - 1;

> +  size_t offset = 0;

> +

> +  /* Initialize bad character shift hash table.  */

> +  memset (shift, 0, sizeof (shift));

> +  for (int i = 1; i < m1; i++)

> +    shift[hash2 (ne + i)] = i;

> +  shift1 = m1 - shift[hash2 (ne + m1)];

> +  shift[hash2 (ne + m1)] = m1;

> +

> +  while (1)

>      {

> -      uint8_t shift[1 << SHIFT_TABLE_BITS];

> -      const unsigned char *end = hs + hs_len - ne_len;

> -

> -      /* Initialize bad character shift hash table.  */

> -      memset (shift, ne_len + 1, sizeof (shift));

> -      for (int i = 0; i < ne_len; i++)

> -	shift[ne[i] % sizeof (shift)] = ne_len - i;

> +      if (__builtin_expect (hs > end, 0))

> +	{

> +	  end += strnlen (end + m1 + 1, 2048);


Why hard-coded 2048 here, vs. ne_len|512 above?

> +	  if (hs > end)

> +	    return NULL;

> +	}

>  

> +      /* Skip past character pairs not in the needle.  */

>        do

>  	{

> -	  hs--;

> -

> -	  /* Search by skipping past bad characters.  */

> -	  size_t tmp = shift[hs[ne_len] % sizeof (shift)];

> -	  for (hs += tmp; hs <= end; hs += tmp)

> -	    {

> -	      tmp = shift[hs[ne_len] % sizeof (shift)];

> -	      if (memcmp (hs, ne, ne_len) == 0)

> -		return (char*) hs;

> -	    }

> -	  if (end[ne_len] == 0)

> -	    return NULL;

> -	  end += strnlen (end + ne_len, 2048);

> +	  hs += m1;

> +	  tmp = shift[hash2 (hs)];

>  	}

> -      while (hs <= end);

> +      while (hs <= end && tmp == 0);

>  

> -      return NULL;

> -    }

> +      /* If the match is not at the end of the needle, shift to the end

> +	 and continue until we match the last 2 characters.  */

> +      hs -= tmp;

> +      if (tmp < m1)

> +	continue;

>  

> -  /* Use Two-Way algorithm for very long needles.  */

> -  return two_way_long_needle (hs, hs_len, ne, ne_len);

> +      /* The last 2 characters match.  If the needle is long, check a

> +	 fixed number of characters first to quickly filter out mismatches.  */

> +      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)

> +	{

> +	  if (memcmp (hs, ne, m1) == 0)

> +	    return (void *) hs;

> +

> +	  /* Adjust filter offset when it doesn't find the mismatch.  */

> +	  offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);

> +	}

> +

> +      /* Skip based on matching the last 2 characters.  */

> +      hs += shift1;

> +    }

>  }

>  #endif /* compilation for speed */

> 


-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org
Wilco Dijkstra Jan. 3, 2019, 11:39 p.m. | #2
Hi Eric,

> Is the new code faster than strstr[34] on short needles, or should you

> be keeping those versions around?


Well it depends. For really short haystacks they can be faster, but not for
long ones. However even if it were faster to use special code for short
needles, there is a misprediction cost from having to choose between the
various cases, so it tends to be faster overall to let the main algorithm deal
with them. I'm thinking of reducing inititalization overhead for small needles
by using a smaller hash table. Alternatively, the space-optimized version is
actually faster for very small haystacks.

> Calling strlen (ne) can be wasteful for someone that calls

> strstr("short", "super_long.....") - such callers will always get an

> answer of NULL, but the question is how much dead work did we do in the

> meantime.  My original Two-Way implementation tried hard to not compute

> the bounds of EITHER needle or haystack in isolation, but rather made a

> first pass through both simultaneously to detect early exit

> opportunities long before an unbounded strlen() on a disproportionately

> long counterpart.  Thus, I think this code should be using strnlen (ne,

> 256) before deciding to switch to Two-Way, rather than an unbounded length.


I don't believe we should optimize for such an uncommon case. That loop
caused a significant slowdown so I removed it a while back:
http://www.sourceware.org/ml/libc-alpha/2018-07/msg00636.html

Using strnlen is less bad than a byte-oriented loop, however it's still slower than
strlen, so it is better to just get the needle length and not worry about rare
corner cases.

> +  /* Use Two-Way algorithm for very long needles.  */

> +  if (__builtin_expect (ne_len > 256, 0))

> +    return two_way_long_needle (hs, hs_len, ne, ne_len);


> Is hs_len accurate (or even ne_len, if you take my suggestion to use

> strnlen() in its compuatation)?  Or do you first need to (re-)compute

> accurate lengths?


ne_len must be accurate for Two-way to work. hs_len doesn't since it always
checks AVAILABLE first to ensure there is enough data left.

> +      if (__builtin_expect (hs > end, 0))

> +     {

> +       end += strnlen (end + m1 + 1, 2048);


> Why hard-coded 2048 here, vs. ne_len|512 above?


Using ne_len | 512 ensures we can early-exit if haystack is smaller than needle
(not sure whether Two-way will work in that case, but that's an invariant I kept).

We don't want to waste time reading ahead too much since it's likely there will
be an early match. However once we've paid the startup overhead and finished
with the initial data without finding a match, the overhead of reading ahead
larger blocks is much lower (and reading more reduces the overhead of strnlen).

Btw badly implemented readahead kills the efficiency of all known Two-way
implementations in GLIBC, gnulib, MUSL, newlib etc. I don't get why Two-way is
in such widespread use, it's the worst and slowest algorithm for typical inputs...

Cheers,
Wilco

Patch

diff --git a/newlib/libc/string/strstr.c b/newlib/libc/string/strstr.c
index 0256c1aa63a4f945c0b8488236f3d6681119f44f..9d5909a696ecfe7987b436706fbb20ba8eeb105f 100644
--- a/newlib/libc/string/strstr.c
+++ b/newlib/libc/string/strstr.c
@@ -94,9 +94,6 @@  strstr (const char *hs, const char *ne)
 
 # include "str-two-way.h"
 
-/* Number of bits used to index shift table.  */
-#define SHIFT_TABLE_BITS 6
-
 static inline char *
 strstr2 (const unsigned char *hs, const unsigned char *ne)
 {
@@ -107,36 +104,19 @@  strstr2 (const unsigned char *hs, const unsigned char *ne)
   return h1 == h2 ? (char *)hs - 2 : NULL;
 }
 
-static inline char *
-strstr3 (const unsigned char *hs, const unsigned char *ne)
-{
-  uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
-  uint32_t h2 = 0;
-  for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
-      h2 = (h2 | c) << 8;
-  return h1 == h2 ? (char *)hs - 3 : NULL;
-}
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
 
-static inline char *
-strstr4 (const unsigned char *hs, const unsigned char *ne)
-{
-  uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8) | ne[3];
-  uint32_t h2 = 0;
-  for (int c = hs[0]; c != 0 && h1 != h2; c = *++hs)
-    h2 = (h2 << 8) | c;
-  return h1 == h2 ? (char *)hs - 4 : NULL;
-}
-
-/* Extremely fast strstr algorithm with guaranteed linear-time performance.
-   Small needles up to size 4 use a dedicated linear search.  Longer needles
-   up to size 254 use Sunday's Quick-Search algorithm.  Due to its simplicity
-   it has the best average performance of string matching algorithms on almost
-   all inputs.  It uses a bad-character shift table to skip past mismatches.
-   By limiting the needle length to 254, the shift table can be reduced to 8
+/* Fast strstr algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
    bits per entry, lowering preprocessing overhead and minimizing cache effects.
-   The limit also implies the worst-case performance is linear.
-   Even larger needles are processed by the linear-time Two-Way algorithm.
-*/
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 char *
 strstr (const char *haystack, const char *needle)
 {
@@ -150,10 +130,6 @@  strstr (const char *haystack, const char *needle)
     return (char*)strchr (hs, ne[0]);
   if (ne[2] == '\0')
     return strstr2 (hs, ne);
-  if (ne[3] == '\0')
-    return strstr3 (hs, ne);
-  if (ne[4] == '\0')
-    return strstr4 (hs, ne);
 
   size_t ne_len = strlen (ne);
   size_t hs_len = strnlen (hs, ne_len | 512);
@@ -162,39 +138,59 @@  strstr (const char *haystack, const char *needle)
   if (hs_len < ne_len)
     return NULL;
 
-  /* Use the Quick-Search algorithm for needle lengths less than 255.  */
-  if (__builtin_expect (ne_len < 255, 1))
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  const unsigned char *end = hs + hs_len - ne_len;
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  /* Initialize bad character shift hash table.  */
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  while (1)
     {
-      uint8_t shift[1 << SHIFT_TABLE_BITS];
-      const unsigned char *end = hs + hs_len - ne_len;
-
-      /* Initialize bad character shift hash table.  */
-      memset (shift, ne_len + 1, sizeof (shift));
-      for (int i = 0; i < ne_len; i++)
-	shift[ne[i] % sizeof (shift)] = ne_len - i;
+      if (__builtin_expect (hs > end, 0))
+	{
+	  end += strnlen (end + m1 + 1, 2048);
+	  if (hs > end)
+	    return NULL;
+	}
 
+      /* Skip past character pairs not in the needle.  */
       do
 	{
-	  hs--;
-
-	  /* Search by skipping past bad characters.  */
-	  size_t tmp = shift[hs[ne_len] % sizeof (shift)];
-	  for (hs += tmp; hs <= end; hs += tmp)
-	    {
-	      tmp = shift[hs[ne_len] % sizeof (shift)];
-	      if (memcmp (hs, ne, ne_len) == 0)
-		return (char*) hs;
-	    }
-	  if (end[ne_len] == 0)
-	    return NULL;
-	  end += strnlen (end + ne_len, 2048);
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
 	}
-      while (hs <= end);
+      while (hs <= end && tmp == 0);
 
-      return NULL;
-    }
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
 
-  /* Use Two-Way algorithm for very long needles.  */
-  return two_way_long_needle (hs, hs_len, ne, ne_len);
+      /* The last 2 characters match.  If the needle is long, check a
+	 fixed number of characters first to quickly filter out mismatches.  */
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter offset when it doesn't find the mismatch.  */
+	  offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+	}
+
+      /* Skip based on matching the last 2 characters.  */
+      hs += shift1;
+    }
 }
 #endif /* compilation for speed */