AArch64: Improve strlen_asimd performance (bug 25824)

Message ID DB8PR08MB503640B9D4858EECD4262346837F0@DB8PR08MB5036.eurprd08.prod.outlook.com
State New
Headers show
Series
  • AArch64: Improve strlen_asimd performance (bug 25824)
Related show

Commit Message

Wilco Dijkstra July 16, 2020, 1 p.m.
Optimize strlen using a mix of scalar and SIMD code. On modern micro
architectures large strings are 2.6 times faster than existing strlen_asimd
and 35% faster than the new MTE version of strlen.

On a random strlen benchmark using small sizes the speedup is 7% vs
strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen
regressions on Cortex-A53 and other cores with a simple Neon unit
(see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )

Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
which can hopefully be added soon).

This fixes big-endian bug 25824. Passes GLIBC regression tests.

I'd like this for 2.32 to fix the bug and avoid any regressions.

---

Comments

Alistair Francis via Libc-alpha July 16, 2020, 3:31 p.m. | #1
On 7/16/20 9:00 AM, Wilco Dijkstra wrote:
> Optimize strlen using a mix of scalar and SIMD code. On modern micro

> architectures large strings are 2.6 times faster than existing strlen_asimd

> and 35% faster than the new MTE version of strlen.

> 

> On a random strlen benchmark using small sizes the speedup is 7% vs

> strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen

> regressions on Cortex-A53 and other cores with a simple Neon unit

> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )

> 

> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when

> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit

> which can hopefully be added soon).

> 

> This fixes big-endian bug 25824. Passes GLIBC regression tests.

> 

> I'd like this for 2.32 to fix the bug and avoid any regressions.


The copyright/description changes don't look correct.

Overall this is OK for 2.32, but Szabolcs should review this also.
 
> ---

> 

> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile

> index a65c554bf3a60ccbed6b519bbbc46aabdf5b6025..4377df0735287c210efd661188f9e6e3923c8003 100644

> --- a/sysdeps/aarch64/multiarch/Makefile

> +++ b/sysdeps/aarch64/multiarch/Makefile

> @@ -4,5 +4,5 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \

>  		   memcpy_new \

>  		   memset_generic memset_falkor memset_emag memset_kunpeng \

>  		   memchr_generic memchr_nosimd \

> -		   strlen_generic strlen_asimd

> +		   strlen_mte strlen_asimd

>  endif

> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c

> index c1df2a012ae17f45f26bd45fc98fe45b2f4d9eb1..1e22fdf8726bf4cd92aed09401b2772f514bf3dc 100644

> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c

> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c

> @@ -62,7 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,

>  

>    IFUNC_IMPL (i, name, strlen,

>  	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)

> -	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))

> +	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))

>  

>    return i;

>  }

> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c

> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644

> --- a/sysdeps/aarch64/multiarch/strlen.c

> +++ b/sysdeps/aarch64/multiarch/strlen.c

> @@ -26,17 +26,15 @@

>  # include <string.h>

>  # include <init-arch.h>

>  

> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)

> +/* This should check HWCAP_MTE when it is available.  */

> +#define MTE_ENABLED() (false)


OK.

>  

>  extern __typeof (__redirect_strlen) __strlen;

>  

> -extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;

> +extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;

>  extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;

>  

> -libc_ifunc (__strlen,

> -	    (USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)

> -	    ? __strlen_asimd

> -	    :__strlen_generic));

> +libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));


OK.

>  

>  # undef strlen

>  strong_alias (__strlen, strlen);

> diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S

> index 236a2c96a6eb5f02b0f0847d230857f0aee87fbe..076a905dceae501d85c1ab59a2250d8305c718f2 100644

> --- a/sysdeps/aarch64/multiarch/strlen_asimd.S

> +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S

> @@ -1,5 +1,4 @@

> -/* Strlen implementation that uses ASIMD instructions for load and NULL checks.

> -   Copyright (C) 2018-2020 Free Software Foundation, Inc.


Why are you removing the first line description?

> +/* Copyright (C) 2020 Free Software Foundation, Inc.


This needs justification.

>  

>     This file is part of the GNU C Library.

>  

> @@ -20,80 +19,90 @@

>  #include <sysdep.h>

>  

>  /* Assumptions:

> + *

> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.

> + * Not MTE compatible.

> + */

> +

> +#define srcin	x0

> +#define len	x0

> +

> +#define src	x1

> +#define data1	x2

> +#define data2	x3

> +#define has_nul1 x4

> +#define has_nul2 x5

> +#define tmp1	x4

> +#define tmp2	x5

> +#define tmp3	x6

> +#define tmp4	x7

> +#define zeroones x8

> +

> +#define maskv	v0

> +#define maskd	d0

> +#define dataq1	q1

> +#define dataq2	q2

> +#define datav1	v1

> +#define datav2	v2

> +#define tmp	x2

> +#define tmpw	w2

> +#define synd	x3

> +#define shift	x4

> +

> +/* For the first 32 bytes, NUL detection works on the principle that

> +   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a

> +   byte is zero, and can be done in parallel across the entire word.  */

>  

> -   ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k.  */

> +#define REP8_01 0x0101010101010101

> +#define REP8_7f 0x7f7f7f7f7f7f7f7f

>  

>  /* To test the page crossing code path more thoroughly, compile with

>     -DTEST_PAGE_CROSS - this will force all calls through the slower

>     entry path.  This option is not intended for production use.  */

>  

> -/* Arguments and results.  */

> -#define srcin		x0

> -#define len		x0

> -

> -/* Locals and temporaries.  */

> -#define src		x1

> -#define data1		x2

> -#define data2		x3

> -#define has_nul1	x4

> -#define has_nul2	x5

> -#define tmp1		x4

> -#define tmp2		x5

> -#define tmp3		x6

> -#define tmp4		x7

> -#define zeroones	x8

> -#define dataq		q2

> -#define datav		v2

> -#define datab2		b3

> -#define dataq2		q3

> -#define datav2		v3

> -

> -#define REP8_01 0x0101010101010101

> -#define REP8_7f 0x7f7f7f7f7f7f7f7f

> -

>  #ifdef TEST_PAGE_CROSS

> -# define MIN_PAGE_SIZE 16

> +# define MIN_PAGE_SIZE 32

>  #else

>  # define MIN_PAGE_SIZE 4096

>  #endif

>  

> -	/* Since strings are short on average, we check the first 16 bytes

> -	   of the string for a NUL character.  In order to do an unaligned load

> -	   safely we have to do a page cross check first.  If there is a NUL

> -	   byte we calculate the length from the 2 8-byte words using

> -	   conditional select to reduce branch mispredictions (it is unlikely

> -	   strlen_asimd will be repeatedly called on strings with the same

> -	   length).

> -

> -	   If the string is longer than 16 bytes, we align src so don't need

> -	   further page cross checks, and process 16 bytes per iteration.

> -

> -	   If the page cross check fails, we read 16 bytes from an aligned

> -	   address, remove any characters before the string, and continue

> -	   in the main loop using aligned loads.  Since strings crossing a

> -	   page in the first 16 bytes are rare (probability of

> -	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.

> -

> -	   AArch64 systems have a minimum page size of 4k.  We don't bother

> -	   checking for larger page sizes - the cost of setting up the correct

> -	   page size is just not worth the extra gain from a small reduction in

> -	   the cases taking the slow path.  Note that we only care about

> -	   whether the first fetch, which may be misaligned, crosses a page

> -	   boundary.  */

> -

> -ENTRY_ALIGN (__strlen_asimd, 6)

> -	DELOUSE (0)

> -	DELOUSE (1)

> +/* Core algorithm:

> +

> +   Since strings are short on average, we check the first 32 bytes of the

> +   string for a NUL character without aligning the string.  In order to use

> +   unaligned loads safely we must do a page cross check first.

> +

> +   If there is a NUL byte we calculate the length from the 2 8-byte words

> +   using conditional select to reduce branch mispredictions (it is unlikely

> +   strlen will be repeatedly called on strings with the same length).

> +

> +   If the string is longer than 32 bytes, align src so we don't need further

> +   page cross checks, and process 32 bytes per iteration using a fast SIMD

> +   loop.

> +

> +   If the page cross check fails, we read 32 bytes from an aligned address,

> +   and ignore any characters before the string.  If it contains a NUL

> +   character, return the length, if not, continue in the main loop.  */

> +

> +ENTRY (__strlen_asimd)

> +        DELOUSE (0)

> +

>  	and	tmp1, srcin, MIN_PAGE_SIZE - 1

> -	mov	zeroones, REP8_01

> -	cmp	tmp1, MIN_PAGE_SIZE - 16

> -	b.gt	L(page_cross)

> +	cmp	tmp1, MIN_PAGE_SIZE - 32

> +	b.hi	L(page_cross)

> +

> +	/* Look for a NUL byte in the first 16 bytes.  */

>  	ldp	data1, data2, [srcin]

> +	mov	zeroones, REP8_01

> +

>  #ifdef __AARCH64EB__

> +	/* For big-endian, carry propagation (if the final byte in the

> +	   string is 0x01) means we cannot use has_nul1/2 directly.

> +	   Since we expect strings to be small and early-exit,

> +	   byte-swap the data now so has_null1/2 will be correct.  */

>  	rev	data1, data1

>  	rev	data2, data2

>  #endif

> -

>  	sub	tmp1, data1, zeroones

>  	orr	tmp2, data1, REP8_7f

>  	sub	tmp3, data2, zeroones

> @@ -101,78 +110,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)

>  	bics	has_nul1, tmp1, tmp2

>  	bic	has_nul2, tmp3, tmp4

>  	ccmp	has_nul2, 0, 0, eq

> -	beq	L(main_loop_entry)

> +	b.eq	L(bytes16_31)

> +

> +	/* Find the exact offset of the first NUL byte in the first 16 bytes

> +	   from the string start.  Enter with C = has_nul1 == 0.  */

>  	csel	has_nul1, has_nul1, has_nul2, cc

>  	mov	len, 8

>  	rev	has_nul1, has_nul1

> -	clz	tmp1, has_nul1

>  	csel	len, xzr, len, cc

> +	clz	tmp1, has_nul1

>  	add	len, len, tmp1, lsr 3

>  	ret

>  

> -L(main_loop_entry):

> -	bic	src, srcin, 15

> -	sub	src, src, 16

> -

> -L(main_loop):

> -	ldr	dataq, [src, 32]!

> -L(page_cross_entry):

> -	/* Get the minimum value and keep going if it is not zero.  */

> -	uminv	datab2, datav.16b

> -	mov	tmp1, datav2.d[0]

> -	cbz	tmp1, L(tail)

> -	ldr	dataq, [src, 16]

> -	uminv	datab2, datav.16b

> -	mov	tmp1, datav2.d[0]

> -	cbnz	tmp1, L(main_loop)

> -	add	src, src, 16

> -

> -L(tail):

> +	.p2align 3

> +	/* Look for a NUL byte at offset 16..31 in the string.  */

> +L(bytes16_31):

> +	ldp	data1, data2, [srcin, 16]

>  #ifdef __AARCH64EB__

> -	rev64	datav.16b, datav.16b

> -#endif

> -	/* Set te NULL byte as 0xff and the rest as 0x00, move the data into a

> -	   pair of scalars and then compute the length from the earliest NULL

> -	   byte.  */

> -	cmeq	datav.16b, datav.16b, #0

> -	mov	data1, datav.d[0]

> -	mov	data2, datav.d[1]

> -	cmp	data1, 0

> -	csel	data1, data1, data2, ne

> -	sub	len, src, srcin

>  	rev	data1, data1

> -	add	tmp2, len, 8

> -	clz	tmp1, data1

> -	csel	len, len, tmp2, ne

> +	rev	data2, data2

> +#endif

> +	sub	tmp1, data1, zeroones

> +	orr	tmp2, data1, REP8_7f

> +	sub	tmp3, data2, zeroones

> +	orr	tmp4, data2, REP8_7f

> +	bics	has_nul1, tmp1, tmp2

> +	bic	has_nul2, tmp3, tmp4

> +	ccmp	has_nul2, 0, 0, eq

> +	b.eq	L(loop_entry)

> +

> +	/* Find the exact offset of the first NUL byte at offset 16..31 from

> +	   the string start.  Enter with C = has_nul1 == 0.  */

> +	csel	has_nul1, has_nul1, has_nul2, cc

> +	mov	len, 24

> +	rev	has_nul1, has_nul1

> +	mov	tmp3, 16

> +	clz	tmp1, has_nul1

> +	csel	len, tmp3, len, cc

>  	add	len, len, tmp1, lsr 3

>  	ret

>  

> -	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede

> -	   srcin to 0xff, so we ignore any NUL bytes before the string.

> -	   Then continue in the aligned loop.  */

> -L(page_cross):

> -	mov	tmp3, 63

> -	bic	src, srcin, 15

> -	and	tmp1, srcin, 7

> -	ands	tmp2, srcin, 8

> -	ldr	dataq, [src]

> -	lsl	tmp1, tmp1, 3

> -	csel	tmp2, tmp2, tmp1, eq

> -	csel	tmp1, tmp1, tmp3, eq

> -	mov	tmp4, -1

> +L(loop_entry):

> +	bic	src, srcin, 31

> +

> +	.p2align 5

> +L(loop):

> +	ldp	dataq1, dataq2, [src, 32]!

> +	uminp	maskv.16b, datav1.16b, datav2.16b

> +	uminp	maskv.16b, maskv.16b, maskv.16b

> +	cmeq	maskv.8b, maskv.8b, 0

> +	fmov	synd, maskd

> +	cbz	synd, L(loop)

> +

> +	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */

> +	cmeq	maskv.16b, datav1.16b, 0

> +	sub	len, src, srcin

> +	tst	synd, 0xffffffff

> +	b.ne	1f

> +	cmeq	maskv.16b, datav2.16b, 0

> +	add	len, len, 16

> +1:

> +	/* Generate a bitmask and compute correct byte offset.  */

>  #ifdef __AARCH64EB__

> -	/* Big-endian.  Early bytes are at MSB.  */

> -	lsr	tmp1, tmp4, tmp1

> -	lsr	tmp2, tmp4, tmp2

> +	bic	maskv.8h, 0xf0

>  #else

> -	/* Little-endian.  Early bytes are at LSB.  */

> -	lsl	tmp1, tmp4, tmp1

> -	lsl	tmp2, tmp4, tmp2

> +	bic	maskv.8h, 0x0f, lsl 8

> +#endif

> +	umaxp	maskv.16b, maskv.16b, maskv.16b

> +	fmov	synd, maskd

> +#ifndef __AARCH64EB__

> +	rbit	synd, synd

>  #endif

> -	mov	datav2.d[0], tmp1

> -	mov	datav2.d[1], tmp2

> -	orn	datav.16b, datav.16b, datav2.16b

> -	b	L(page_cross_entry)

> +	clz	tmp, synd

> +	add	len, len, tmp, lsr 2

> +	ret

> +

> +        .p2align 4

> +

> +L(page_cross):

> +	bic	src, srcin, 31

> +	mov	tmpw, 0x0c03

> +	movk	tmpw, 0xc030, lsl 16

> +	ld1	{datav1.16b, datav2.16b}, [src]

> +	dup	maskv.4s, tmpw

> +	cmeq	datav1.16b, datav1.16b, 0

> +	cmeq	datav2.16b, datav2.16b, 0

> +	and	datav1.16b, datav1.16b, maskv.16b

> +	and	datav2.16b, datav2.16b, maskv.16b

> +	addp	maskv.16b, datav1.16b, datav2.16b

> +	addp	maskv.16b, maskv.16b, maskv.16b

> +	fmov	synd, maskd

> +	lsl	shift, srcin, 1

> +	lsr	synd, synd, shift

> +	cbz	synd, L(loop)

> +

> +	rbit	synd, synd

> +	clz	len, synd

> +	lsr	len, len, 1

> +	ret

> +

>  END (__strlen_asimd)

>  weak_alias (__strlen_asimd, strlen_asimd)

>  libc_hidden_builtin_def (strlen_asimd)

> diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_mte.S

> similarity index 88%

> rename from sysdeps/aarch64/multiarch/strlen_generic.S

> rename to sysdeps/aarch64/multiarch/strlen_mte.S

> index 61d3f72c9985bdd103d5e4c68337fed4a55511be..b8daa54dd89afbd99a6338cef45f49a25defaa26 100644

> --- a/sysdeps/aarch64/multiarch/strlen_generic.S

> +++ b/sysdeps/aarch64/multiarch/strlen_mte.S


OK. Rename.

> @@ -17,14 +17,14 @@

>     <https://www.gnu.org/licenses/>.  */

>  

>  /* The actual strlen code is in ../strlen.S.  If we are building libc this file

> -   defines __strlen_generic.  Otherwise the include of ../strlen.S will define

> +   defines __strlen_mte.  Otherwise the include of ../strlen.S will define


OK.

>     the normal __strlen  entry points.  */

>  

>  #include <sysdep.h>

>  

>  #if IS_IN (libc)

>  

> -# define STRLEN __strlen_generic

> +# define STRLEN __strlen_mte

>  

>  /* Do not hide the generic version of strlen, we use it internally.  */

>  # undef libc_hidden_builtin_def

> @@ -32,7 +32,7 @@

>  

>  # ifdef SHARED

>  /* It doesn't make sense to send libc-internal strlen calls through a PLT. */

> -	.globl __GI_strlen; __GI_strlen = __strlen_generic

> +	.globl __GI_strlen; __GI_strlen = __strlen_mte

>  # endif

>  #endif

>  

> 

> 



-- 
Cheers,
Carlos.
Szabolcs Nagy July 16, 2020, 4:52 p.m. | #2
The 07/16/2020 11:31, Carlos O'Donell wrote:
> On 7/16/20 9:00 AM, Wilco Dijkstra wrote:

> > Optimize strlen using a mix of scalar and SIMD code. On modern micro

> > architectures large strings are 2.6 times faster than existing strlen_asimd

> > and 35% faster than the new MTE version of strlen.

> > 

> > On a random strlen benchmark using small sizes the speedup is 7% vs

> > strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen

> > regressions on Cortex-A53 and other cores with a simple Neon unit

> > (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )

> > 

> > Rename __strlen_generic to __strlen_mte, and select strlen_asimd when

> > MTE is not enabled (this is waiting on support for a HWCAP_MTE bit

> > which can hopefully be added soon).

> > 

> > This fixes big-endian bug 25824. Passes GLIBC regression tests.

> > 

> > I'd like this for 2.32 to fix the bug and avoid any regressions.

> 

> The copyright/description changes don't look correct.

> 

> Overall this is OK for 2.32, but Szabolcs should review this also.

...
> > diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c

> > index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644

> > --- a/sysdeps/aarch64/multiarch/strlen.c

> > +++ b/sysdeps/aarch64/multiarch/strlen.c

> > @@ -26,17 +26,15 @@

> >  # include <string.h>

> >  # include <init-arch.h>

> >  

> > -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)

> > +/* This should check HWCAP_MTE when it is available.  */

> > +#define MTE_ENABLED() (false)

> 

> OK.


i'd like glibc 2.32 to support heap tagging via malloc
interposition (on future linux + future hw).

MTE_ENABLED==false in the ifunc dispatch prevents that.
(we select non-mte-safe strlen)

is adding the HWCAP value right before release OK?
i need to discuss with linux devs if we can reserve
the value ahead of time.

the patch is OK with the current logic, i will try to deal
with this issue separately.
Alistair Francis via Libc-alpha July 16, 2020, 6:28 p.m. | #3
On 7/16/20 12:52 PM, Szabolcs Nagy wrote:
> The 07/16/2020 11:31, Carlos O'Donell wrote:

>> On 7/16/20 9:00 AM, Wilco Dijkstra wrote:

>>> Optimize strlen using a mix of scalar and SIMD code. On modern micro

>>> architectures large strings are 2.6 times faster than existing strlen_asimd

>>> and 35% faster than the new MTE version of strlen.

>>>

>>> On a random strlen benchmark using small sizes the speedup is 7% vs

>>> strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen

>>> regressions on Cortex-A53 and other cores with a simple Neon unit

>>> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )

>>>

>>> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when

>>> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit

>>> which can hopefully be added soon).

>>>

>>> This fixes big-endian bug 25824. Passes GLIBC regression tests.

>>>

>>> I'd like this for 2.32 to fix the bug and avoid any regressions.

>>

>> The copyright/description changes don't look correct.

>>

>> Overall this is OK for 2.32, but Szabolcs should review this also.

> ...

>>> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c

>>> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644

>>> --- a/sysdeps/aarch64/multiarch/strlen.c

>>> +++ b/sysdeps/aarch64/multiarch/strlen.c

>>> @@ -26,17 +26,15 @@

>>>  # include <string.h>

>>>  # include <init-arch.h>

>>>  

>>> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)

>>> +/* This should check HWCAP_MTE when it is available.  */

>>> +#define MTE_ENABLED() (false)

>>

>> OK.

> 

> i'd like glibc 2.32 to support heap tagging via malloc

> interposition (on future linux + future hw).

> 

> MTE_ENABLED==false in the ifunc dispatch prevents that.

> (we select non-mte-safe strlen)

> 

> is adding the HWCAP value right before release OK?

> i need to discuss with linux devs if we can reserve

> the value ahead of time.


glibc would obviously like to see that HWCAP value finalized
and in a released kernel so it doesn't change.
 
> the patch is OK with the current logic, i will try to deal

> with this issue separately.

 
I assume this means you accept the patch as-is?

It's clearer if people provide a "Reviewed-by:" line in cases
like this; that way they indicate a clear intent that the patch
is reviewed and substantial issues are solved.

-- 
Cheers,
Carlos.
Szabolcs Nagy July 17, 2020, 11:29 a.m. | #4
The 07/16/2020 14:28, Carlos O'Donell wrote:
> On 7/16/20 12:52 PM, Szabolcs Nagy wrote:

> > The 07/16/2020 11:31, Carlos O'Donell wrote:

> >> On 7/16/20 9:00 AM, Wilco Dijkstra wrote:

> >>> Optimize strlen using a mix of scalar and SIMD code. On modern micro

> >>> architectures large strings are 2.6 times faster than existing strlen_asimd

> >>> and 35% faster than the new MTE version of strlen.

> >>>

> >>> On a random strlen benchmark using small sizes the speedup is 7% vs

> >>> strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen

> >>> regressions on Cortex-A53 and other cores with a simple Neon unit

> >>> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )

> >>>

> >>> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when

> >>> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit

> >>> which can hopefully be added soon).

> >>>

> >>> This fixes big-endian bug 25824. Passes GLIBC regression tests.

> >>>

> >>> I'd like this for 2.32 to fix the bug and avoid any regressions.

> >>

> >> The copyright/description changes don't look correct.

> >>

> >> Overall this is OK for 2.32, but Szabolcs should review this also.

> > ...

> >>> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c

> >>> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644

> >>> --- a/sysdeps/aarch64/multiarch/strlen.c

> >>> +++ b/sysdeps/aarch64/multiarch/strlen.c

> >>> @@ -26,17 +26,15 @@

> >>>  # include <string.h>

> >>>  # include <init-arch.h>

> >>>  

> >>> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)

> >>> +/* This should check HWCAP_MTE when it is available.  */

> >>> +#define MTE_ENABLED() (false)

> >>

> >> OK.

> > 

> > i'd like glibc 2.32 to support heap tagging via malloc

> > interposition (on future linux + future hw).

> > 

> > MTE_ENABLED==false in the ifunc dispatch prevents that.

> > (we select non-mte-safe strlen)

> > 

> > is adding the HWCAP value right before release OK?

> > i need to discuss with linux devs if we can reserve

> > the value ahead of time.

> 

> glibc would obviously like to see that HWCAP value finalized

> and in a released kernel so it doesn't change.

>  

> > the patch is OK with the current logic, i will try to deal

> > with this issue separately.

>  

> I assume this means you accept the patch as-is?

> 

> It's clearer if people provided a "Reviewed-by:" line in cases

> like this, that way they indicate a clear intent that the patch

> is reviewed and substantial issues solved.


OK with the description header fix.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

Patch

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index a65c554bf3a60ccbed6b519bbbc46aabdf5b6025..4377df0735287c210efd661188f9e6e3923c8003 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -4,5 +4,5 @@  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_new \
 		   memset_generic memset_falkor memset_emag memset_kunpeng \
 		   memchr_generic memchr_nosimd \
-		   strlen_generic strlen_asimd
+		   strlen_mte strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index c1df2a012ae17f45f26bd45fc98fe45b2f4d9eb1..1e22fdf8726bf4cd92aed09401b2772f514bf3dc 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -62,7 +62,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
-	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -26,17 +26,15 @@ 
 # include <string.h>
 # include <init-arch.h>
 
-#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
+/* This should check HWCAP_MTE when it is available.  */
+#define MTE_ENABLED() (false)
 
 extern __typeof (__redirect_strlen) __strlen;
 
-extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
-libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
-	    ? __strlen_asimd
-	    :__strlen_generic));
+libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 236a2c96a6eb5f02b0f0847d230857f0aee87fbe..076a905dceae501d85c1ab59a2250d8305c718f2 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -1,5 +1,4 @@ 
-/* Strlen implementation that uses ASIMD instructions for load and NULL checks.
-   Copyright (C) 2018-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2020 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -20,80 +19,90 @@ 
 #include <sysdep.h>
 
 /* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
+ */
+
+#define srcin	x0
+#define len	x0
+
+#define src	x1
+#define data1	x2
+#define data2	x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1	x4
+#define tmp2	x5
+#define tmp3	x6
+#define tmp4	x7
+#define zeroones x8
+
+#define maskv	v0
+#define maskd	d0
+#define dataq1	q1
+#define dataq2	q2
+#define datav1	v1
+#define datav2	v2
+#define tmp	x2
+#define tmpw	w2
+#define synd	x3
+#define shift	x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+   byte is zero, and can be done in parallel across the entire word.  */
 
-   ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k.  */
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
 
 /* To test the page crossing code path more thoroughly, compile with
    -DTEST_PAGE_CROSS - this will force all calls through the slower
    entry path.  This option is not intended for production use.  */
 
-/* Arguments and results.  */
-#define srcin		x0
-#define len		x0
-
-/* Locals and temporaries.  */
-#define src		x1
-#define data1		x2
-#define data2		x3
-#define has_nul1	x4
-#define has_nul2	x5
-#define tmp1		x4
-#define tmp2		x5
-#define tmp3		x6
-#define tmp4		x7
-#define zeroones	x8
-#define dataq		q2
-#define datav		v2
-#define datab2		b3
-#define dataq2		q3
-#define datav2		v3
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 16
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
-	/* Since strings are short on average, we check the first 16 bytes
-	   of the string for a NUL character.  In order to do an unaligned load
-	   safely we have to do a page cross check first.  If there is a NUL
-	   byte we calculate the length from the 2 8-byte words using
-	   conditional select to reduce branch mispredictions (it is unlikely
-	   strlen_asimd will be repeatedly called on strings with the same
-	   length).
-
-	   If the string is longer than 16 bytes, we align src so don't need
-	   further page cross checks, and process 16 bytes per iteration.
-
-	   If the page cross check fails, we read 16 bytes from an aligned
-	   address, remove any characters before the string, and continue
-	   in the main loop using aligned loads.  Since strings crossing a
-	   page in the first 16 bytes are rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-	   AArch64 systems have a minimum page size of 4k.  We don't bother
-	   checking for larger page sizes - the cost of setting up the correct
-	   page size is just not worth the extra gain from a small reduction in
-	   the cases taking the slow path.  Note that we only care about
-	   whether the first fetch, which may be misaligned, crosses a page
-	   boundary.  */
-
-ENTRY_ALIGN (__strlen_asimd, 6)
-	DELOUSE (0)
-	DELOUSE (1)
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string.  If it contains a NUL
+   character, return the length, if not, continue in the main loop.  */
+
+ENTRY (__strlen_asimd)
+        DELOUSE (0)
+
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
-	mov	zeroones, REP8_01
-	cmp	tmp1, MIN_PAGE_SIZE - 16
-	b.gt	L(page_cross)
+	cmp	tmp1, MIN_PAGE_SIZE - 32
+	b.hi	L(page_cross)
+
+	/* Look for a NUL byte in the first 16 bytes.  */
 	ldp	data1, data2, [srcin]
+	mov	zeroones, REP8_01
+
 #ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_null1/2 will be correct.  */
 	rev	data1, data1
 	rev	data2, data2
 #endif
-
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, REP8_7f
 	sub	tmp3, data2, zeroones
@@ -101,78 +110,105 @@  ENTRY_ALIGN (__strlen_asimd, 6)
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(main_loop_entry)
+	b.eq	L(bytes16_31)
+
+	/* Find the exact offset of the first NUL byte in the first 16 bytes
+	   from the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
 	rev	has_nul1, has_nul1
-	clz	tmp1, has_nul1
 	csel	len, xzr, len, cc
+	clz	tmp1, has_nul1
 	add	len, len, tmp1, lsr 3
 	ret
 
-L(main_loop_entry):
-	bic	src, srcin, 15
-	sub	src, src, 16
-
-L(main_loop):
-	ldr	dataq, [src, 32]!
-L(page_cross_entry):
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbz	tmp1, L(tail)
-	ldr	dataq, [src, 16]
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop)
-	add	src, src, 16
-
-L(tail):
+	.p2align 3
+	/* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+	ldp	data1, data2, [srcin, 16]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
-#endif
-	/* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
-	   pair of scalars and then compute the length from the earliest NULL
-	   byte.  */
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
-	sub	len, src, srcin
 	rev	data1, data1
-	add	tmp2, len, 8
-	clz	tmp1, data1
-	csel	len, len, tmp2, ne
+	rev	data2, data2
+#endif
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	b.eq	L(loop_entry)
+
+	/* Find the exact offset of the first NUL byte at offset 16..31 from
+	   the string start.  Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 24
+	rev	has_nul1, has_nul1
+	mov	tmp3, 16
+	clz	tmp1, has_nul1
+	csel	len, tmp3, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
-	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-	   srcin to 0xff, so we ignore any NUL bytes before the string.
-	   Then continue in the aligned loop.  */
-L(page_cross):
-	mov	tmp3, 63
-	bic	src, srcin, 15
-	and	tmp1, srcin, 7
-	ands	tmp2, srcin, 8
-	ldr	dataq, [src]
-	lsl	tmp1, tmp1, 3
-	csel	tmp2, tmp2, tmp1, eq
-	csel	tmp1, tmp1, tmp3, eq
-	mov	tmp4, -1
+L(loop_entry):
+	bic	src, srcin, 31
+
+	.p2align 5
+L(loop):
+	ldp	dataq1, dataq2, [src, 32]!
+	uminp	maskv.16b, datav1.16b, datav2.16b
+	uminp	maskv.16b, maskv.16b, maskv.16b
+	cmeq	maskv.8b, maskv.8b, 0
+	fmov	synd, maskd
+	cbz	synd, L(loop)
+
+	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+	cmeq	maskv.16b, datav1.16b, 0
+	sub	len, src, srcin
+	tst	synd, 0xffffffff
+	b.ne	1f
+	cmeq	maskv.16b, datav2.16b, 0
+	add	len, len, 16
+1:
+	/* Generate a bitmask and compute correct byte offset.  */
 #ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsr	tmp1, tmp4, tmp1
-	lsr	tmp2, tmp4, tmp2
+	bic	maskv.8h, 0xf0
 #else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsl	tmp1, tmp4, tmp1
-	lsl	tmp2, tmp4, tmp2
+	bic	maskv.8h, 0x0f, lsl 8
+#endif
+	umaxp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	mov	datav2.d[0], tmp1
-	mov	datav2.d[1], tmp2
-	orn	datav.16b, datav.16b, datav2.16b
-	b	L(page_cross_entry)
+	clz	tmp, synd
+	add	len, len, tmp, lsr 2
+	ret
+
+	.p2align 4
+
+L(page_cross):
+	bic	src, srcin, 31
+	mov	tmpw, 0x0c03
+	movk	tmpw, 0xc030, lsl 16
+	ld1	{datav1.16b, datav2.16b}, [src]
+	dup	maskv.4s, tmpw
+	cmeq	datav1.16b, datav1.16b, 0
+	cmeq	datav2.16b, datav2.16b, 0
+	and	datav1.16b, datav1.16b, maskv.16b
+	and	datav2.16b, datav2.16b, maskv.16b
+	addp	maskv.16b, datav1.16b, datav2.16b
+	addp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+	lsl	shift, srcin, 1
+	lsr	synd, synd, shift
+	cbz	synd, L(loop)
+
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 1
+	ret
+
 END (__strlen_asimd)
 weak_alias (__strlen_asimd, strlen_asimd)
 libc_hidden_builtin_def (strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_mte.S
similarity index 88%
rename from sysdeps/aarch64/multiarch/strlen_generic.S
rename to sysdeps/aarch64/multiarch/strlen_mte.S
index 61d3f72c9985bdd103d5e4c68337fed4a55511be..b8daa54dd89afbd99a6338cef45f49a25defaa26 100644
--- a/sysdeps/aarch64/multiarch/strlen_generic.S
+++ b/sysdeps/aarch64/multiarch/strlen_mte.S
@@ -17,14 +17,14 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 /* The actual strlen code is in ../strlen.S.  If we are building libc this file
-   defines __strlen_generic.  Otherwise the include of ../strlen.S will define
+   defines __strlen_mte.  Otherwise the include of ../strlen.S will define
    the normal __strlen  entry points.  */
 
 #include <sysdep.h>
 
 #if IS_IN (libc)
 
-# define STRLEN __strlen_generic
+# define STRLEN __strlen_mte
 
 /* Do not hide the generic version of strlen, we use it internally.  */
 # undef libc_hidden_builtin_def
@@ -32,7 +32,7 @@ 
 
 # ifdef SHARED
 /* It doesn't make sense to send libc-internal strlen calls through a PLT. */
-	.globl __GI_strlen; __GI_strlen = __strlen_generic
+	.globl __GI_strlen; __GI_strlen = __strlen_mte
 # endif
 #endif