[1/2] arm: Fix vcond_mask expander for MVE (PR target/100757)

Message ID 20210608213844.6218-1-christophe.lyon@linaro.org
State New
Headers show
Series
  • [1/2] arm: Fix vcond_mask expander for MVE (PR target/100757)
Related show

Commit Message

Thomas Koenig via Gcc-patches June 8, 2021, 9:38 p.m.
The problem in this PR is that we call VPSEL with a mask of vector
type instead of HImode. This happens because operand 3 in vcond_mask
is the pre-computed vector comparison and has vector type. The fix is
to transfer this value to VPR.P0 by comparing operand 3 with a vector
of constant 1 of the same type as operand 3.

The pr100757*.c testcases are derived from
gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using
different types and return values different from 0 and 1 to avoid
commonalization with boolean masks.

Reducing the number of iterations in pr100757-3.c from 32 to 8, we
generate the code below:

float a[32];
float fn1(int d) {
  int c = 4;
  for (int b = 0; b < 8; b++)
    if (a[b] != 2.0f)
      c = 5;
  return c;
}

fn1:
        ldr     r3, .L4+80
	vpush.64        {d8, d9}
	vldrw.32        q3, [r3]	// q3=a[0..3]
	vldr.64 d8, .L4			// q4=(2.0,2.0,2.0,2.0)
	vldr.64 d9, .L4+8
	adds    r3, r3, #16
	vcmp.f32        eq, q3, q4	// cmp a[0..3] == (2.0,2.0,2.0,2.0)
	vldr.64 d2, .L4+16		// q1=(1,1,1,1)
	vldr.64 d3, .L4+24
	vldrw.32        q3, [r3]	// q3=a[4..7]
	vldr.64 d4, .L4+32		// q2=(0,0,0,0)
	vldr.64 d5, .L4+40
	vpsel q0, q1, q2		// q0=select (a[0..3])
	vcmp.f32        eq, q3, q4	// cmp a[4..7] == (2.0,2.0,2.0,2.0)
	vldm    sp!, {d8-d9}
	vpsel q2, q1, q2		// q2=select (a[4..7])
	vand    q2, q0, q2		// q2=select (a[0..3]) && select (a[4..7])
	vldr.64 d6, .L4+48		// q3=(4.0,4.0,4.0,4.0)
	vldr.64 d7, .L4+56
	vldr.64 d0, .L4+64		// q0=(5.0,5.0,5.0,5.0)
	vldr.64 d1, .L4+72
	vcmp.i32  eq, q2, q1		// cmp mask(a[0..7]) == (1,1,1,1)
	vpsel q3, q3, q0		// q3= vcond_mask(4.0,5.0)
	vmov.32 r3, q3[0]		// keep the scalar max
	vmov.32 r1, q3[1]
	vmov.32 r0, q3[3]
	vmov.32 r2, q3[2]
	vmov    s14, r1
	vmov    s15, r3
	vmaxnm.f32      s15, s15, s14
	vmov    s14, r2
	vmaxnm.f32      s15, s15, s14
	vmov    s14, r0
	vmaxnm.f32      s15, s15, s14
	vmov    r0, s15
	bx      lr
	.L5:
	.align  3
	.L4:
	.word   1073741824
	.word   1073741824
	.word   1073741824
	.word   1073741824
	.word   1
	.word   1
	.word   1
	.word   1
	.word   0
	.word   0
	.word   0
	.word   0
	.word   1082130432
	.word   1082130432
	.word   1082130432
	.word   1082130432
	.word   1084227584
	.word   1084227584
	.word   1084227584
	.word   1084227584

2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/100757
	gcc/
	* config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix
	expansion for MVE.

	gcc/testsuite/
	* gcc.target/arm/simd/pr100757.c: New test.
	* gcc.target/arm/simd/pr100757-2.c: New test.
	* gcc.target/arm/simd/pr100757-3.c: New test.
---
 gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--
 .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++
 .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++
 4 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

-- 
2.25.1

Comments

Thomas Koenig via Gcc-patches June 9, 2021, 3:04 p.m. | #1
Christophe Lyon <christophe.lyon@linaro.org> writes:
> The problem in this PR is that we call VPSEL with a mask of vector

> type instead of HImode. This happens because operand 3 in vcond_mask

> is the pre-computed vector comparison and has vector type. The fix is

> to transfer this value to VPR.P0 by comparing operand 3 with a vector

> of constant 1 of the same type as operand 3.


The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE
and return HImode for MVE.  This is how AVX512 handles masks.

It might be worth trying that to see how it works.  I'm not sure
off-hand whether it'll produce worse code or better code.  However,
using HImode as the mask mode would help when defining other
predicated optabs in future.

Thanks,
Richard

> The pr100757*.c testcases are derived from

> gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using

> different types and return values different from 0 and 1 to avoid

> commonalization with boolean masks.

>

> Reducing the number of iterations in pr100757-3.c from 32 to 8, we

> generate the code below:

>

> float a[32];

> float fn1(int d) {

>   int c = 4;

>   for (int b = 0; b < 8; b++)

>     if (a[b] != 2.0f)

>       c = 5;

>   return c;

> }

>

> fn1:

>         ldr     r3, .L4+80

> 	vpush.64        {d8, d9}

> 	vldrw.32        q3, [r3]	// q3=a[0..3]

> 	vldr.64 d8, .L4			// q4=(2.0,2.0,2.0,2.0)

> 	vldr.64 d9, .L4+8

> 	adds    r3, r3, #16

> 	vcmp.f32        eq, q3, q4	// cmp a[0..3] == (2.0,2.0,2.0,2.0)

> 	vldr.64 d2, .L4+16		// q1=(1,1,1,1)

> 	vldr.64 d3, .L4+24

> 	vldrw.32        q3, [r3]	// q3=a[4..7]

> 	vldr.64 d4, .L4+32		// q2=(0,0,0,0)

> 	vldr.64 d5, .L4+40

> 	vpsel q0, q1, q2		// q0=select (a[0..3])

> 	vcmp.f32        eq, q3, q4	// cmp a[4..7] == (2.0,2.0,2.0,2.0)

> 	vldm    sp!, {d8-d9}

> 	vpsel q2, q1, q2		// q2=select (a[4..7])

> 	vand    q2, q0, q2		// q2=select (a[0..3]) && select (a[4..7])

> 	vldr.64 d6, .L4+48		// q3=(4.0,4.0,4.0,4.0)

> 	vldr.64 d7, .L4+56

> 	vldr.64 d0, .L4+64		// q0=(5.0,5.0,5.0,5.0)

> 	vldr.64 d1, .L4+72

> 	vcmp.i32  eq, q2, q1		// cmp mask(a[0..7]) == (1,1,1,1)

> 	vpsel q3, q3, q0		// q3= vcond_mask(4.0,5.0)

> 	vmov.32 r3, q3[0]		// keep the scalar max

> 	vmov.32 r1, q3[1]

> 	vmov.32 r0, q3[3]

> 	vmov.32 r2, q3[2]

> 	vmov    s14, r1

> 	vmov    s15, r3

> 	vmaxnm.f32      s15, s15, s14

> 	vmov    s14, r2

> 	vmaxnm.f32      s15, s15, s14

> 	vmov    s14, r0

> 	vmaxnm.f32      s15, s15, s14

> 	vmov    r0, s15

> 	bx      lr

> 	.L5:

> 	.align  3

> 	.L4:

> 	.word   1073741824

> 	.word   1073741824

> 	.word   1073741824

> 	.word   1073741824

> 	.word   1

> 	.word   1

> 	.word   1

> 	.word   1

> 	.word   0

> 	.word   0

> 	.word   0

> 	.word   0

> 	.word   1082130432

> 	.word   1082130432

> 	.word   1082130432

> 	.word   1082130432

> 	.word   1084227584

> 	.word   1084227584

> 	.word   1084227584

> 	.word   1084227584

>

> 2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

>

> 	PR target/100757

> 	gcc/

> 	* config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix

> 	expansion for MVE.

>

> 	gcc/testsuite/

> 	* gcc.target/arm/simd/pr100757.c: New test.

> 	* gcc.target/arm/simd/pr100757-2.c: New test.

> 	* gcc.target/arm/simd/pr100757-3.c: New test.

> ---

>  gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--

>  .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++

>  .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++

>  gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++

>  4 files changed, 81 insertions(+), 2 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

>

> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> index 0ffc7a9322c..ccdfaa8321f 100644

> --- a/gcc/config/arm/vec-common.md

> +++ b/gcc/config/arm/vec-common.md

> @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"

>      }

>    else if (TARGET_HAVE_MVE)

>      {

> -      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],

> -                                 operands[1], operands[2], operands[3]));

> +      /* Convert pre-computed vector comparison into VPR.P0 by comparing

> +         operand 3 with a vector of '1', then use VPSEL.  */

> +      machine_mode cmp_mode = GET_MODE (operands[3]);

> +      rtx vpr_p0 = gen_reg_rtx (HImode);

> +      rtx one = gen_reg_rtx (cmp_mode);

> +      emit_move_insn (one, CONST1_RTX (cmp_mode));

> +      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one));

> +

> +      switch (GET_MODE_CLASS (<MODE>mode))

> +        {

> +          case MODE_VECTOR_INT:

> +            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> +            break;

> +          case MODE_VECTOR_FLOAT:

> +	    if (TARGET_HAVE_MVE_FLOAT)

> +              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> +	    else

> +	      gcc_unreachable ();

> +            break;

> +          default:

> +            gcc_unreachable ();

> +         }

>      }

>    else

>      gcc_unreachable ();

> diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> new file mode 100644

> index 00000000000..993ce369090

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> @@ -0,0 +1,20 @@

> +/* { dg-do compile } */

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> +

> +float a[32];

> +int fn1(int d) {

> +  int c = 4;

> +  for (int b = 0; b < 32; b++)

> +    if (a[b] != 2.0f)

> +      c = 5;

> +  return c;

> +}

> +

> +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */

> +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */

> diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> new file mode 100644

> index 00000000000..b94a73b2d2c

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> @@ -0,0 +1,20 @@

> +/* { dg-do compile } */

> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> +/* { dg-add-options arm_v8_1m_mve_fp } */

> +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> +/* Copied from gcc.c-torture/compile/20160205-1.c.  */

> +

> +float a[32];

> +float fn1(int d) {

> +  float c = 4;

> +  for (int b = 0; b < 32; b++)

> +    if (a[b] != 2.0f)

> +      c = 5;

> +  return c;

> +}

> +

> +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c.  */

> +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c.  */

> diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> new file mode 100644

> index 00000000000..e51e716b4ec

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> @@ -0,0 +1,19 @@

> +/* { dg-do compile } */

> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */

> +/* { dg-add-options arm_v8_1m_mve } */

> +/* { dg-additional-options "-O3" } */

> +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> +

> +int a[32];

> +int fn1(int d) {

> +  int c = 2;

> +  for (int b = 0; b < 32; b++)

> +    if (a[b])

> +      c = 3;

> +  return c;

> +}

> +

> +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */

> +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */
Thomas Koenig via Gcc-patches July 2, 2021, 8:53 a.m. | #2
Hi,

On Wed, 9 Jun 2021 at 17:04, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>

> Christophe Lyon <christophe.lyon@linaro.org> writes:

> > The problem in this PR is that we call VPSEL with a mask of vector

> > type instead of HImode. This happens because operand 3 in vcond_mask

> > is the pre-computed vector comparison and has vector type. The fix is

> > to transfer this value to VPR.P0 by comparing operand 3 with a vector

> > of constant 1 of the same type as operand 3.

>

> The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE

> and return HImode for MVE.  This is how AVX512 handles masks.

>

> It might be worth trying that to see how it works.  I'm not sure

> off-hand whether it'll produce worse code or better code.  However,

> using HImode as the mask mode would help when defining other

> predicated optabs in future.

>


Here is my v2 of this patch, hopefully implementing what you suggested.

Sorry it took me so long, but as implementing this hook was of course
not sufficient, and it took me a while to figure out I needed to keep the
non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created
another one... I shouldn't have added so many tests ;-)

I'm not sure how to improve the vectorizer doc, to better describe the
vec_cmp/vcond patterns and see which ones the vectorizer is trying
to use (to understand which ones I should implement).

Then I realized I was about to break Neon support, so I decided
it was safer to add Neon tests ;-)

Is that version OK?

Thanks,

Christophe


> Thanks,

> Richard

>

> > The pr100757*.c testcases are derived from

> > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using

> > different types and return values different from 0 and 1 to avoid

> > commonalization with boolean masks.

> >

> > Reducing the number of iterations in pr100757-3.c from 32 to 8, we

> > generate the code below:

> >

> > float a[32];

> > float fn1(int d) {

> >   int c = 4;

> >   for (int b = 0; b < 8; b++)

> >     if (a[b] != 2.0f)

> >       c = 5;

> >   return c;

> > }

> >

> > fn1:

> >         ldr     r3, .L4+80

> >       vpush.64        {d8, d9}

> >       vldrw.32        q3, [r3]        // q3=a[0..3]

> >       vldr.64 d8, .L4                 // q4=(2.0,2.0,2.0,2.0)

> >       vldr.64 d9, .L4+8

> >       adds    r3, r3, #16

> >       vcmp.f32        eq, q3, q4      // cmp a[0..3] == (2.0,2.0,2.0,2.0)

> >       vldr.64 d2, .L4+16              // q1=(1,1,1,1)

> >       vldr.64 d3, .L4+24

> >       vldrw.32        q3, [r3]        // q3=a[4..7]

> >       vldr.64 d4, .L4+32              // q2=(0,0,0,0)

> >       vldr.64 d5, .L4+40

> >       vpsel q0, q1, q2                // q0=select (a[0..3])

> >       vcmp.f32        eq, q3, q4      // cmp a[4..7] == (2.0,2.0,2.0,2.0)

> >       vldm    sp!, {d8-d9}

> >       vpsel q2, q1, q2                // q2=select (a[4..7])

> >       vand    q2, q0, q2              // q2=select (a[0..3]) && select (a[4..7])

> >       vldr.64 d6, .L4+48              // q3=(4.0,4.0,4.0,4.0)

> >       vldr.64 d7, .L4+56

> >       vldr.64 d0, .L4+64              // q0=(5.0,5.0,5.0,5.0)

> >       vldr.64 d1, .L4+72

> >       vcmp.i32  eq, q2, q1            // cmp mask(a[0..7]) == (1,1,1,1)

> >       vpsel q3, q3, q0                // q3= vcond_mask(4.0,5.0)

> >       vmov.32 r3, q3[0]               // keep the scalar maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch

> >       vmov.32 r1, q3[1]

> >       vmov.32 r0, q3[3]

> >       vmov.32 r2, q3[2]

> >       vmov    s14, r1

> >       vmov    s15, r3

> >       vmaxnm.f32      s15, s15, s14

> >       vmov    s14, r2

> >       vmaxnm.f32      s15, s15, s14

> >       vmov    s14, r0

> >       vmaxnm.f32      s15, s15, s14

> >       vmov    r0, s15

> >       bx      lr

> >       .L5:

> >       .align  3

> >       .L4:

> >       .word   1073741824

> >       .word   1073741824

> >       .word   1073741824

> >       .word   1073741824

> >       .word   1

> >       .word   1

> >       .word   1

> >       .word   1

> >       .word   0

> >       .word   0

> >       .word   0

> >       .word   0

> >       .word   1082130432

> >       .word   1082130432

> >       .word   1082130432

> >       .word   1082130432

> >       .word   1084227584

> >       .word   1084227584

> >       .word   1084227584

> >       .word   1084227584

> >

> > 2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

> >

> >       PR target/100757

> >       gcc/

> >       * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix

> >       expansion for MVE.

> >

> >       gcc/testsuite/

> >       * gcc.target/arm/simd/pr100757.c: New test.

> >       * gcc.target/arm/simd/pr100757-2.c: New test.

> >       * gcc.target/arm/simd/pr100757-3.c: New test.

> > ---

> >  gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--

> >  .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++

> >  .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++

> >  gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++

> >  4 files changed, 81 insertions(+), 2 deletions(-)

> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

> >

> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> > index 0ffc7a9322c..ccdfaa8321f 100644

> > --- a/gcc/config/arm/vec-common.md

> > +++ b/gcc/config/arm/vec-common.md

> > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"

> >      }

> >    else if (TARGET_HAVE_MVE)

> >      {

> > -      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],

> > -                                 operands[1], operands[2], operands[3]));

> > +      /* Convert pre-computed vector comparison into VPR.P0 by comparing

> > +         operand 3 with a vector of '1', then use VPSEL.  */

> > +      machine_mode cmp_mode = GET_MODE (operands[3]);

> > +      rtx vpr_p0 = gen_reg_rtx (HImode);

> > +      rtx one = gen_reg_rtx (cmp_mode);

> > +      emit_move_insn (one, CONST1_RTX (cmp_mode));

> > +      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one));

> > +

> > +      switch (GET_MODE_CLASS (<MODE>mode))

> > +        {

> > +          case MODE_VECTOR_INT:

> > +            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> > +            break;

> > +          case MODE_VECTOR_FLOAT:

> > +         if (TARGET_HAVE_MVE_FLOAT)

> > +              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> > +         else

> > +           gcc_unreachable ();

> > +            break;

> > +          default:

> > +            gcc_unreachable ();

> > +         }

> >      }

> >    else

> >      gcc_unreachable ();

> > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > new file mode 100644

> > index 00000000000..993ce369090

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > @@ -0,0 +1,20 @@

> > +/* { dg-do compile } */

> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> > +

> > +float a[32];

> > +int fn1(int d) {

> > +  int c = 4;

> > +  for (int b = 0; b < 32; b++)

> > +    if (a[b] != 2.0f)

> > +      c = 5;

> > +  return c;

> > +}

> > +

> > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */

> > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > new file mode 100644

> > index 00000000000..b94a73b2d2c

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > @@ -0,0 +1,20 @@

> > +/* { dg-do compile } */

> > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > +/* Copied from gcc.c-torture/compile/20160205-1.c.  */

> > +

> > +float a[32];

> > +float fn1(int d) {

> > +  float c = 4;

> > +  for (int b = 0; b < 32; b++)

> > +    if (a[b] != 2.0f)

> > +      c = 5;

> > +  return c;

> > +}

> > +

> > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c.  */

> > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > new file mode 100644

> > index 00000000000..e51e716b4ec

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > @@ -0,0 +1,19 @@

> > +/* { dg-do compile } */

> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */

> > +/* { dg-add-options arm_v8_1m_mve } */

> > +/* { dg-additional-options "-O3" } */

> > +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> > +

> > +int a[32];

> > +int fn1(int d) {

> > +  int c = 2;

> > +  for (int b = 0; b < 32; b++)

> > +    if (a[b])

> > +      c = 3;

> > +  return c;

> > +}

> > +

> > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */

> > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */
From 44f559535006df6d370bf23498ab7a3800798dae Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.lyon@linaro.org>
Date: Tue, 8 Jun 2021 21:32:28 +0000
Subject: [PATCH v2] arm: Fix vcond_mask expander for MVE (PR target/100757)

The problem in this PR is that we call VPSEL with a mask of vector
type instead of HImode. This happens because operand 3 in vcond_mask
is the pre-computed vector comparison and has vector type.

This patch fixes it by implementing TARGET_VECTORIZE_GET_MASK_MODE,
returning HImode when targeting MVE.  In turn, this implies
implementing vec_cmp<mode>hi, vec_cmpu<mode>hi and
vcond_mask_<mode>hi, but we still need to keep the non-HI versions
even for MVE, which was not obvious. Indeed, only defining
TARGET_VECTORIZE_GET_MASK_MODE causes regressions (no vectorization),
but also ICEs.  At some point I also added vcond<V_cvtto>hi, but it
seems useless (it would help if the vectorizer traces included the
list of optabs it is trying to use)..

The patch moves vcond_mask_<mode><v_cmp_result> back to neon.md since
it is not used by MVE anymore. The new *hi patterns listed above are
implemented in mve.md since they are only valid for MVE. However this
may make maintenance/comparison more painful than having all of them
in vec-common.md.

Compared to neon.md's vcond_mask_<mode><v_cmp_result> before my
"arm: Auto-vectorization for MVE: vcmp" patch (r12-834
a6eacbf1055520e968d1a25f6d30d6ff4b66272d), it keeps the VDQWH iterator
added in r12-835 7606865198b241b4c944f66761d6506b02ead951 (to have
V4HF/V8HF support), as well as the
(!<Is_float_mode> || flag_unsafe_math_optimizations) condition which
was not present before r12-834 although SF modes were enabled by VDQW
(I think this was a bug).

Using TARGET_VECTORIZE_GET_MASK_MODE has the advantage that we no
longer need to generate vpsel with vectors of 0 and 1: the masks are
now merged via scalar 'and' instructions operating on 16-bit masks
(HImode).

In addition, this patch fixes a problem in arm_expand_vcond() where
the result would be a vector of 0 or 1 instead of operand 1 or 2.  The
mve-vcmp-f32-2.c testcase is an update from mve-vcmp-f32.c using a
conditional with 2.0f and 3.0f constants to help scan-assembler-times.

The patch adds several Neon tests similar to the MVE ones to make sure
the fix for MVE does not break Neon support.

The pr100757*.c testcases are derived from
gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using
various types and return values different from 0 and 1 to avoid
commonalization with boolean masks.  In addition, since we no longer
need these masks unlike the previous version of this patch, the tests
make sure they are not present.

Reducing the number of iterations in pr100757-3.c from 32 to 8, we
generate the code below:

float a[32];
float fn1(int d) {
  int c = 4;
  for (int b = 0; b < 8; b++)
    if (a[b] != 2.0f)
      c = 5;
  return c;
}

fn1:
        ldr     r3, .L3+48
	vldr.64 d4, .L3			// q2=(2.0,2.0,2.0,2.0)
	vldr.64 d5, .L3+8
	vldrw.32        q0, [r3]	// q0=a[0..3]
	adds    r3, r3, #16
	vcmp.f32        eq, q0, q2	// cmp a[0..3] == (2.0,2.0,2.0,2.0)
	vldrw.32        q1, [r3]	// q1=a[4..7]
	vmrs    r3, P0  @ movhi		// r3=mask[0..3]
	vcmp.f32        eq, q1, q2	// cmp a[4..7] == (2.0,2.0,2.0,2.0)
	vmrs    r2, P0  @ movhi		// r2=mask[4..7]
	ands    r3, r3, r2		// r3=mask[0..7]
	vmsr     P0, r3 @ movhi
	vldr.64 d4, .L3+16		// q2=(4.0,4.0,4.0,4.0)
	vldr.64 d5, .L3+24
	vldr.64 d6, .L3+32		// q3=(5.0,5.0,5.0,5.0)
	vldr.64 d7, .L3+40
	vpsel q3, q3, q2		// q3=select (a[0..7])
	vmov.32 r3, q3[0]
	vmov.32 r1, q3[1]
	vmov.32 r0, q3[3]
	vmov.32 r2, q3[2]
	vmov    s14, r1
	vmov    s15, r3
	vmaxnm.f32      s15, s15, s14
	vmov    s14, r2
	vmaxnm.f32      s15, s15, s14
	vmov    s14, r0
	vmaxnm.f32      s15, s15, s14
	vmov    r0, s15
	bx      lr
	.L4:
	.align  3
	.L3:
	.word   1073741824
	.word   1073741824
	.word   1073741824
	.word   1073741824
	.word   1084227584
	.word   1084227584
	.word   1084227584
	.word   1084227584
	.word   1082130432
	.word   1082130432
	.word   1082130432
	.word   1082130432

2021-07-02  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/100757
	gcc/
	* config/arm/arm-protos.h (arm_get_mask_mode): New prototype.
	* config/arm/arm.c (TARGET_VECTORIZE_GET_MASK_MODE): New.
	(arm_expand_vector_compare): Remove useless generation of vpsel.
	(arm_expand_vcond): Fix select operands.
	(arm_get_mask_mode): New.
	* config/arm/mve.md (vec_cmp<mode>hi): New.
	(vec_cmpu<mode>hi): New.
	(vcond_mask_<mode>hi): New.
	* config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Move to ...
	* config/arm/neon.md (vcond_mask_<mode><v_cmp_result>): ... here
	and disable for MVE.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vcmp-f32-2.c: New test.
	* gcc.target/arm/simd/neon-compare-1.c: New test.
	* gcc.target/arm/simd/neon-compare-2.c: New test.
	* gcc.target/arm/simd/neon-compare-3.c: New test.
	* gcc.target/arm/simd/neon-compare-scalar-1.c: New test.
	* gcc.target/arm/simd/neon-vcmp-f16.c: New test.
	* gcc.target/arm/simd/neon-vcmp-f32-2.c: New test.
	* gcc.target/arm/simd/neon-vcmp-f32-3.c: New test.
	* gcc.target/arm/simd/neon-vcmp-f32.c: New test.
	* gcc.target/arm/simd/neon-vcmp.c: New test.
	* gcc.target/arm/simd/pr100757.c: New test.
	* gcc.target/arm/simd/pr100757-2.c: New test.
	* gcc.target/arm/simd/pr100757-3.c: New test.
	* gcc.target/arm/simd/pr100757-4.c: New test.
---
 gcc/config/arm/arm-protos.h                   |  1 +
 gcc/config/arm/arm.c                          | 56 +++++++------
 gcc/config/arm/mve.md                         | 55 +++++++++++++
 gcc/config/arm/neon.md                        | 14 ++++
 gcc/config/arm/vec-common.md                  | 25 ------
 .../gcc.target/arm/simd/mve-vcmp-f32-2.c      | 32 ++++++++
 .../gcc.target/arm/simd/neon-compare-1.c      | 78 +++++++++++++++++++
 .../gcc.target/arm/simd/neon-compare-2.c      | 13 ++++
 .../gcc.target/arm/simd/neon-compare-3.c      | 14 ++++
 .../arm/simd/neon-compare-scalar-1.c          | 57 ++++++++++++++
 .../gcc.target/arm/simd/neon-vcmp-f16.c       | 12 +++
 .../gcc.target/arm/simd/neon-vcmp-f32-2.c     | 15 ++++
 .../gcc.target/arm/simd/neon-vcmp-f32-3.c     | 12 +++
 .../gcc.target/arm/simd/neon-vcmp-f32.c       | 12 +++
 gcc/testsuite/gcc.target/arm/simd/neon-vcmp.c | 22 ++++++
 .../gcc.target/arm/simd/pr100757-2.c          | 20 +++++
 .../gcc.target/arm/simd/pr100757-3.c          | 20 +++++
 .../gcc.target/arm/simd/pr100757-4.c          | 19 +++++
 gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++
 19 files changed, 442 insertions(+), 54 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-compare-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-compare-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-compare-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-compare-scalar-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f16.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vcmp.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-4.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 9b1f61394ad..3c84969b340 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -201,6 +201,7 @@ extern void arm_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
 extern bool arm_pad_reg_upward (machine_mode, tree, int);
 #endif
 extern int arm_apply_result_size (void);
+extern opt_machine_mode arm_get_mask_mode (machine_mode mode);
 
 #endif /* RTX_CODE */
 
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 7b37e1b602c..f9672397a26 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -838,6 +838,10 @@ static const struct attribute_spec arm_attribute_table[] =
 
 #undef TARGET_MD_ASM_ADJUST
 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
+
+#undef TARGET_VECTORIZE_GET_MASK_MODE
+#define TARGET_VECTORIZE_GET_MASK_MODE arm_get_mask_mode
+
 
 /* Obstack for minipool constant handling.  */
 static struct obstack minipool_obstack;
@@ -31061,13 +31065,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 
 	  /* If we are not expanding a vcond, build the result here.  */
 	  if (!vcond_mve)
-	    {
-	      rtx zero = gen_reg_rtx (cmp_result_mode);
-	      rtx one = gen_reg_rtx (cmp_result_mode);
-	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
-	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));
-	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
-	    }
+	    emit_move_insn (target, vpr_p0);
 	}
       else
 	emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
@@ -31088,13 +31086,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 
 	  emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
 	  if (!vcond_mve)
-	    {
-	      rtx zero = gen_reg_rtx (cmp_result_mode);
-	      rtx one = gen_reg_rtx (cmp_result_mode);
-	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
-	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));
-	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
-	    }
+	    emit_move_insn (target, vpr_p0);
 	}
       else
 	emit_insn (gen_neon_vc (code, cmp_mode, target,
@@ -31115,13 +31107,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
 
 	  emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
 	  if (!vcond_mve)
-	    {
-	      rtx zero = gen_reg_rtx (cmp_result_mode);
-	      rtx one = gen_reg_rtx (cmp_result_mode);
-	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
-	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));
-	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
-	    }
+	    emit_move_insn (target, vpr_p0);
 	}
       else
 	emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
@@ -31163,7 +31149,7 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
 
   if (TARGET_HAVE_MVE)
     {
-      vcond_mve=true;
+      vcond_mve = true;
       mask = gen_reg_rtx (HImode);
     }
   else
@@ -31178,20 +31164,21 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
 			    mask, operands[1], operands[2]));
   else
     {
-      machine_mode cmp_mode = GET_MODE (operands[4]);
+      machine_mode cmp_mode = GET_MODE (operands[0]);
       rtx vpr_p0 = mask;
-      rtx zero = gen_reg_rtx (cmp_mode);
-      rtx one = gen_reg_rtx (cmp_mode);
-      emit_move_insn (zero, CONST0_RTX (cmp_mode));
-      emit_move_insn (one, CONST1_RTX (cmp_mode));
+
       switch (GET_MODE_CLASS (cmp_mode))
 	{
 	case MODE_VECTOR_INT:
-	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0],
+				     operands[1], operands[2], vpr_p0));
 	  break;
 	case MODE_VECTOR_FLOAT:
 	  if (TARGET_HAVE_MVE_FLOAT)
-	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
+	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0],
+					 operands[1], operands[2], vpr_p0));
+	  else
+	    gcc_unreachable ();
 	  break;
 	default:
 	  gcc_unreachable ();
@@ -34163,4 +34150,15 @@ arm_mode_base_reg_class (machine_mode mode)
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
+/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
+
+opt_machine_mode
+arm_get_mask_mode (machine_mode mode)
+{
+  if (TARGET_HAVE_MVE)
+    return HImode;
+
+  return default_get_mask_mode (mode);
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index e393518ea88..a9840408bdd 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -10516,3 +10516,58 @@ (define_insn "*movmisalign<mode>_mve_load"
   "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
   [(set_attr "type" "mve_load")]
 )
+
+;; Expanders for vec_cmp and vcond
+
+(define_expand "vec_cmp<mode>hi"
+  [(set (match_operand:HI 0 "s_register_operand")
+	(match_operator:HI 1 "comparison_operator"
+	  [(match_operand:MVE_VLD_ST 2 "s_register_operand")
+	   (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))]
+  "TARGET_HAVE_MVE
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false, false);
+  DONE;
+})
+
+(define_expand "vec_cmpu<mode>hi"
+  [(set (match_operand:HI 0 "s_register_operand")
+	(match_operator:HI 1 "comparison_operator"
+	  [(match_operand:MVE_2 2 "s_register_operand")
+	   (match_operand:MVE_2 3 "reg_or_zero_operand")]))]
+  "TARGET_HAVE_MVE
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false, false);
+  DONE;
+})
+
+(define_expand "vcond_mask_<mode>hi"
+  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand")
+	(if_then_else:MVE_VLD_ST
+	  (match_operand:HI 3 "s_register_operand")
+	  (match_operand:MVE_VLD_ST 1 "s_register_operand")
+	  (match_operand:MVE_VLD_ST 2 "s_register_operand")))]
+  "TARGET_HAVE_MVE"
+{
+  switch (GET_MODE_CLASS (<MODE>mode))
+    {
+      case MODE_VECTOR_INT:
+	emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
+				   operands[1], operands[2], operands[3]));
+	break;
+      case MODE_VECTOR_FLOAT:
+	if (TARGET_HAVE_MVE_FLOAT)
+	  emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0],
+				       operands[1], operands[2], operands[3]));
+	else
+	  gcc_unreachable ();
+	break;
+      default:
+	gcc_unreachable ();
+    }
+  DONE;
+})
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 392d9607919..d3ca48d707c 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1403,6 +1403,20 @@ (define_insn "*us_sub<mode>_neon"
   [(set_attr "type" "neon_qsub<q>")]
 )
 
+(define_expand "vcond_mask_<mode><v_cmp_result>"
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+	(if_then_else:VDQWH
+	  (match_operand:<V_cmp_result> 3 "s_register_operand")
+	  (match_operand:VDQWH 1 "s_register_operand")
+	  (match_operand:VDQWH 2 "s_register_operand")))]
+  "TARGET_NEON
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
+				  operands[2]));
+  DONE;
+})
+
 ;; Patterns for builtins.
 
 ; good for plain vadd, vaddq.
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index f90afa4cdb9..8b4ec57fb5d 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -461,31 +461,6 @@ (define_expand "vcondu<mode><v_cmp_result>"
   DONE;
 })
 
-(define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQWH 0 "s_register_operand")
-        (if_then_else:VDQWH
-          (match_operand:<V_cmp_result> 3 "s_register_operand")
-          (match_operand:VDQWH 1 "s_register_operand")
-          (match_operand:VDQWH 2 "s_register_operand")))]
-  "ARM_HAVE_<MODE>_ARITH
-   && !TARGET_REALLY_IWMMXT
-   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  if (TARGET_NEON)
-    {
-      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
-                                operands[1], operands[2]));
-    }
-  else if (TARGET_HAVE_MVE)
-    {
-      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
-                                 operands[1], operands[2], operands[3]));
-    }
-  else
-    gcc_unreachable ();
-  DONE;
-})
-
 (define_expand "vec_load_lanesoi<mode>"
   [(set (match_operand:OI 0 "s_register_operand")
         (unspec:OI [(match_operand:OI 1 "neon_struct_operand")
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32-2.c
new file mode 100644
index 00000000000..917a95bf141
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32-2.c
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include <stdint.h>
+
+#define NB 4
+
+#define FUNC(OP, NAME)							\
+  void test_ ## NAME ##_f (float * __restrict__ dest, float *a, float *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = (a[i] OP b[i]) ? 2.0f : 3.0f;				\
+    }									\
+  }
+
+FUNC(==, vcmpeq)
+FUNC(!=, vcmpne)
+FUNC(<, vcmplt)
+FUNC(<=, vcmple)
+FUNC(>, vcmpgt)
+FUNC(>=, vcmpge)
+
+/* { dg-final { scan-assembler-times {\tvcmp.f32\teq, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tne, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tlt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tle, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tgt, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcmp.f32\tge, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 24 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t1077936128\n} 24 } } */ /* Constant 3.0f.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-compare-1.c b/gcc/testsuite/gcc.target/arm/simd/neon-compare-1.c
new file mode 100644
index 00000000000..2e0222a71f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-compare-1.c
@@ -0,0 +1,78 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3" } */
+
+#include "mve-compare-1.c"
+
+/* 64-bit vectors.  */
+/* vmvn is used by 'ne' comparisons: 3 sizes * 2 (signed/unsigned) * 2
+   (register/zero) = 12.  */
+/* { dg-final { scan-assembler-times {\tvmvn\td[0-9]+, d[0-9]+\n} 12 } } */
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge }. */
+/* ne uses eq, lt/le only apply to comparison with zero, they use gt/ge
+   otherwise.  */
+/* { dg-final { scan-assembler-times {\tvceq.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i8\td[0-9]+, d[0-9]+, #0\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvclt.s8\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcle.s8\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s8\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s8\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge }. */
+/* { dg-final { scan-assembler-times {\tvceq.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i16\td[0-9]+, d[0-9]+, #0\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvclt.s16\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcle.s16\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s16\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s16\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge }. */
+/* { dg-final { scan-assembler-times {\tvceq.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i32\td[0-9]+, d[0-9]+, #0\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvclt.s32\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcle.s32\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s32\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s32\td[0-9]+, d[0-9]+, #0\n} 1 } } */
+
+/* 128-bit vectors.  */
+
+/* vmvn is used by 'ne' comparisons.  */
+/* { dg-final { scan-assembler-times {\tvmvn\tq[0-9]+, q[0-9]+\n} 12 } } */
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i8\tq[0-9]+, q[0-9]+, #0\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvclt.s8\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcle.s8\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s8\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s8\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i16\tq[0-9]+, q[0-9]+, #0\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvclt.s16\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcle.s16\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s16\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s16\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i32\tq[0-9]+, q[0-9]+, #0\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvclt.s32\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcle.s32\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s32\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s32\tq[0-9]+, q[0-9]+, #0\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-compare-2.c b/gcc/testsuite/gcc.target/arm/simd/neon-compare-2.c
new file mode 100644
index 00000000000..06f3c14c91e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-compare-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include "mve-compare-2.c"
+
+/* eq, ne, lt, le, gt, ge.  */
+/* ne uses eq+vmvn, lt/le use gt/ge with swapped operands.  */
+/* { dg-final { scan-assembler-times {\tvceq.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvmvn\tq[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-compare-3.c b/gcc/testsuite/gcc.target/arm/simd/neon-compare-3.c
new file mode 100644
index 00000000000..9c9f108843b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-compare-3.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include "mve-compare-3.c"
+
+
+/* eq, ne, lt, le, gt, ge.  */
+/* ne uses eq+vmvn, lt/le use gt/ge with swapped operands.  */
+/* { dg-final { scan-assembler-times {\tvceq.f16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvmvn\tq[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.f16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.f16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-compare-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/neon-compare-scalar-1.c
new file mode 100644
index 00000000000..0783624a3f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-compare-scalar-1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3" } */
+
+#include "mve-compare-scalar-1.c"
+
+/* 64-bit vectors.  */
+/* vmvn is used by 'ne' comparisons.  */
+/* { dg-final { scan-assembler-times {\tvmvn\td[0-9]+, d[0-9]+\n} 6 } } */
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
+
+/* 128-bit vectors.  */
+
+/* vmvn is used by 'ne' comparisons.  */
+/* { dg-final { scan-assembler-times {\tvmvn\tq[0-9]+, q[0-9]+\n} 6 } } */
+
+/* { 8 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 16 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { 32 bits } x { eq, ne, lt, le, gt, ge }.  */
+/* { dg-final { scan-assembler-times {\tvceq.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f16.c b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f16.c
new file mode 100644
index 00000000000..688fd9a235f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f16.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include "mve-vcmp-f16.c"
+
+/* 'ne' uses vceq.  */
+/* le and lt use ge and gt with inverted operands.  */
+/* { dg-final { scan-assembler-times {\tvceq.f16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.f16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.f16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-2.c b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-2.c
new file mode 100644
index 00000000000..a22923eb242
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include "mve-vcmp-f32-2.c"
+
+/* 'ne' uses vceq.  */
+/* le and lt use ge and gt with inverted operands.  */
+/* { dg-final { scan-assembler-times {\tvceq.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tvmov.f32\tq[0-9]+, #2.0e\+0} 6 } } */
+/* { dg-final { scan-assembler-times {\tvmov.f32\tq[0-9]+, #3.0e\+0} 6 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-3.c b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-3.c
new file mode 100644
index 00000000000..4f12f043d3a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3" } */
+
+#include "mve-vcmp-f32.c"
+
+/* Should not be vectorized, since we do not use -funsafe-math-optimizations.  */
+
+/* { dg-final { scan-assembler-not {\tvceq.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */
+/* { dg-final { scan-assembler-not {\tvcge.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */
+/* { dg-final { scan-assembler-not {\tvcgt.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32.c b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32.c
new file mode 100644
index 00000000000..06e5c4fd1d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp-f32.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+
+#include "mve-vcmp-f32.c"
+
+/* 'ne' uses vceq.  */
+/* le and lt use ge and gt with inverted operands.  */
+/* { dg-final { scan-assembler-times {\tvceq.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcge.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vcmp.c b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp.c
new file mode 100644
index 00000000000..f2b92b1be7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/neon-vcmp.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+/* { dg-additional-options "-O3" } */
+
+#include "mve-vcmp.c"
+
+/* vceq is also used for 'ne' comparisons.  */
+/* { dg-final { scan-assembler-times {\tvceq.i[0-9]+\td[0-9]+, d[0-9]+, d[0-9]+\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tvceq.i[0-9]+\tq[0-9]+, q[0-9]+, q[0-9]+\n} 12 } } */
+
+/* lt and le are replaced with the opposite condition, hence the double number
+   of matches for gt and ge.  */
+/* { dg-final { scan-assembler-times {\tvcge.s[0-9]+\td[0-9]+, d[0-9]+, d[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcge.s[0-9]+\tq[0-9]+, q[0-9]+, q[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u[0-9]+\td[0-9]+, d[0-9]+, d[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcge.u[0-9]+\tq[0-9]+, q[0-9]+, q[0-9]+\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tvcgt.s[0-9]+\td[0-9]+, d[0-9]+, d[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.s[0-9]+\tq[0-9]+, q[0-9]+, q[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u[0-9]+\td[0-9]+, d[0-9]+, d[0-9]+\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tvcgt.u[0-9]+\tq[0-9]+, q[0-9]+, q[0-9]+\n} 6 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
new file mode 100644
index 00000000000..c2262b4d81e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+float a[32];
+int fn1(int d) {
+  int c = 4;
+  for (int b = 0; b < 32; b++)
+    if (a[b] != 2.0f)
+      c = 5;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */
+/* { dg-final { scan-assembler-not {\t.word\t1\n} } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-not {\t.word\t0\n} } } */ /* 'false' mask.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
new file mode 100644
index 00000000000..e604555c04c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+/* Copied from gcc.c-torture/compile/20160205-1.c.  */
+
+float a[32];
+float fn1(int d) {
+  float c = 4.0f;
+  for (int b = 0; b < 32; b++)
+    if (a[b] != 2.0f)
+      c = 5.0f;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c (4.0).  */
+/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c (5.0).  */
+/* { dg-final { scan-assembler-not {\t.word\t1\n} } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-not {\t.word\t0\n} } } */ /* 'false' mask.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-4.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-4.c
new file mode 100644
index 00000000000..c12040c517f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-4.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+unsigned int a[32];
+int fn1(int d) {
+  int c = 2;
+  for (int b = 0; b < 32; b++)
+    if (a[b])
+      c = 3;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-not {\t.word\t1\n} } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c
new file mode 100644
index 00000000000..41d6e4e2d7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+int a[32];
+int fn1(int d) {
+  int c = 2;
+  for (int b = 0; b < 32; b++)
+    if (a[b])
+      c = 3;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-not {\t.word\t1\n} } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */
Thomas Koenig via Gcc-patches July 2, 2021, 9:07 a.m. | #3
On Fri, 2 Jul 2021 at 10:53, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>

> Hi,

>

> On Wed, 9 Jun 2021 at 17:04, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

> >

> > Christophe Lyon <christophe.lyon@linaro.org> writes:

> > > The problem in this PR is that we call VPSEL with a mask of vector

> > > type instead of HImode. This happens because operand 3 in vcond_mask

> > > is the pre-computed vector comparison and has vector type. The fix is

> > > to transfer this value to VPR.P0 by comparing operand 3 with a vector

> > > of constant 1 of the same type as operand 3.

> >

> > The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE

> > and return HImode for MVE.  This is how AVX512 handles masks.

> >

> > It might be worth trying that to see how it works.  I'm not sure

> > off-hand whether it'll produce worse code or better code.  However,

> > using HImode as the mask mode would help when defining other

> > predicated optabs in future.

> >

>

> Here is my v2 of this patch, hopefully implementing what you suggested.

>

> Sorry it took me so long, but as implementing this hook was of course

> not sufficient, and it took me a while to figure out I needed to keep the

> non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created

> another one... I shouldn't have added so many tests ;-)

>

> I'm not sure how to improve the vectorizer doc, to better describe the

> vec_cmp/vcond patterns and see which ones the vectorizer is trying

> to use (to understand which ones I should implement).

>

> Then I realized I was about to break Neon support, so I decided

> it was safer to add Neon tests ;-)

>

> Is that version OK?

>

I forgot to mention that I merged the original patch 2/2
https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572300.html
into this one.

> Thanks,

>

> Christophe

>

>

> > Thanks,

> > Richard

> >

> > > The pr100757*.c testcases are derived from

> > > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using

> > > different types and return values different from 0 and 1 to avoid

> > > commonalization with boolean masks.

> > >

> > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we

> > > generate the code below:

> > >

> > > float a[32];

> > > float fn1(int d) {

> > >   int c = 4;

> > >   for (int b = 0; b < 8; b++)

> > >     if (a[b] != 2.0f)

> > >       c = 5;

> > >   return c;

> > > }

> > >

> > > fn1:

> > >         ldr     r3, .L4+80

> > >       vpush.64        {d8, d9}

> > >       vldrw.32        q3, [r3]        // q3=a[0..3]

> > >       vldr.64 d8, .L4                 // q4=(2.0,2.0,2.0,2.0)

> > >       vldr.64 d9, .L4+8

> > >       adds    r3, r3, #16

> > >       vcmp.f32        eq, q3, q4      // cmp a[0..3] == (2.0,2.0,2.0,2.0)

> > >       vldr.64 d2, .L4+16              // q1=(1,1,1,1)

> > >       vldr.64 d3, .L4+24

> > >       vldrw.32        q3, [r3]        // q3=a[4..7]

> > >       vldr.64 d4, .L4+32              // q2=(0,0,0,0)

> > >       vldr.64 d5, .L4+40

> > >       vpsel q0, q1, q2                // q0=select (a[0..3])

> > >       vcmp.f32        eq, q3, q4      // cmp a[4..7] == (2.0,2.0,2.0,2.0)

> > >       vldm    sp!, {d8-d9}

> > >       vpsel q2, q1, q2                // q2=select (a[4..7])

> > >       vand    q2, q0, q2              // q2=select (a[0..3]) && select (a[4..7])

> > >       vldr.64 d6, .L4+48              // q3=(4.0,4.0,4.0,4.0)

> > >       vldr.64 d7, .L4+56

> > >       vldr.64 d0, .L4+64              // q0=(5.0,5.0,5.0,5.0)

> > >       vldr.64 d1, .L4+72

> > >       vcmp.i32  eq, q2, q1            // cmp mask(a[0..7]) == (1,1,1,1)

> > >       vpsel q3, q3, q0                // q3= vcond_mask(4.0,5.0)

> > >       vmov.32 r3, q3[0]               // keep the scalar maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch

> > >       vmov.32 r1, q3[1]

> > >       vmov.32 r0, q3[3]

> > >       vmov.32 r2, q3[2]

> > >       vmov    s14, r1

> > >       vmov    s15, r3

> > >       vmaxnm.f32      s15, s15, s14

> > >       vmov    s14, r2

> > >       vmaxnm.f32      s15, s15, s14

> > >       vmov    s14, r0

> > >       vmaxnm.f32      s15, s15, s14

> > >       vmov    r0, s15

> > >       bx      lr

> > >       .L5:

> > >       .align  3

> > >       .L4:

> > >       .word   1073741824

> > >       .word   1073741824

> > >       .word   1073741824

> > >       .word   1073741824

> > >       .word   1

> > >       .word   1

> > >       .word   1

> > >       .word   1

> > >       .word   0

> > >       .word   0

> > >       .word   0

> > >       .word   0

> > >       .word   1082130432

> > >       .word   1082130432

> > >       .word   1082130432

> > >       .word   1082130432

> > >       .word   1084227584

> > >       .word   1084227584

> > >       .word   1084227584

> > >       .word   1084227584

> > >

> > > 2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

> > >

> > >       PR target/100757

> > >       gcc/

> > >       * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix

> > >       expansion for MVE.

> > >

> > >       gcc/testsuite/

> > >       * gcc.target/arm/simd/pr100757.c: New test.

> > >       * gcc.target/arm/simd/pr100757-2.c: New test.

> > >       * gcc.target/arm/simd/pr100757-3.c: New test.

> > > ---

> > >  gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--

> > >  .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++

> > >  .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++

> > >  gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++

> > >  4 files changed, 81 insertions(+), 2 deletions(-)

> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > >

> > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

> > > index 0ffc7a9322c..ccdfaa8321f 100644

> > > --- a/gcc/config/arm/vec-common.md

> > > +++ b/gcc/config/arm/vec-common.md

> > > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"

> > >      }

> > >    else if (TARGET_HAVE_MVE)

> > >      {

> > > -      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],

> > > -                                 operands[1], operands[2], operands[3]));

> > > +      /* Convert pre-computed vector comparison into VPR.P0 by comparing

> > > +         operand 3 with a vector of '1', then use VPSEL.  */

> > > +      machine_mode cmp_mode = GET_MODE (operands[3]);

> > > +      rtx vpr_p0 = gen_reg_rtx (HImode);

> > > +      rtx one = gen_reg_rtx (cmp_mode);

> > > +      emit_move_insn (one, CONST1_RTX (cmp_mode));

> > > +      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one));

> > > +

> > > +      switch (GET_MODE_CLASS (<MODE>mode))

> > > +        {

> > > +          case MODE_VECTOR_INT:

> > > +            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> > > +            break;

> > > +          case MODE_VECTOR_FLOAT:

> > > +         if (TARGET_HAVE_MVE_FLOAT)

> > > +              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0));

> > > +         else

> > > +           gcc_unreachable ();

> > > +            break;

> > > +          default:

> > > +            gcc_unreachable ();

> > > +         }

> > >      }

> > >    else

> > >      gcc_unreachable ();

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > > new file mode 100644

> > > index 00000000000..993ce369090

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > > @@ -0,0 +1,20 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> > > +

> > > +float a[32];

> > > +int fn1(int d) {

> > > +  int c = 4;

> > > +  for (int b = 0; b < 32; b++)

> > > +    if (a[b] != 2.0f)

> > > +      c = 5;

> > > +  return c;

> > > +}

> > > +

> > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > > new file mode 100644

> > > index 00000000000..b94a73b2d2c

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > > @@ -0,0 +1,20 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > +/* Copied from gcc.c-torture/compile/20160205-1.c.  */

> > > +

> > > +float a[32];

> > > +float fn1(int d) {

> > > +  float c = 4;

> > > +  for (int b = 0; b < 32; b++)

> > > +    if (a[b] != 2.0f)

> > > +      c = 5;

> > > +  return c;

> > > +}

> > > +

> > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c.  */

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > > new file mode 100644

> > > index 00000000000..e51e716b4ec

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > > @@ -0,0 +1,19 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve } */

> > > +/* { dg-additional-options "-O3" } */

> > > +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> > > +

> > > +int a[32];

> > > +int fn1(int d) {

> > > +  int c = 2;

> > > +  for (int b = 0; b < 32; b++)

> > > +    if (a[b])

> > > +      c = 3;

> > > +  return c;

> > > +}

> > > +

> > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */
Thomas Koenig via Gcc-patches July 13, 2021, 9:11 a.m. | #4
Ping?

Le ven. 2 juil. 2021 à 10:53, Christophe Lyon <christophe.lyon@linaro.org>
a écrit :

> Hi,

>

> On Wed, 9 Jun 2021 at 17:04, Richard Sandiford

> <richard.sandiford@arm.com> wrote:

> >

> > Christophe Lyon <christophe.lyon@linaro.org> writes:

> > > The problem in this PR is that we call VPSEL with a mask of vector

> > > type instead of HImode. This happens because operand 3 in vcond_mask

> > > is the pre-computed vector comparison and has vector type. The fix is

> > > to transfer this value to VPR.P0 by comparing operand 3 with a vector

> > > of constant 1 of the same type as operand 3.

> >

> > The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE

> > and return HImode for MVE.  This is how AVX512 handles masks.

> >

> > It might be worth trying that to see how it works.  I'm not sure

> > off-hand whether it'll produce worse code or better code.  However,

> > using HImode as the mask mode would help when defining other

> > predicated optabs in future.

> >

>

> Here is my v2 of this patch, hopefully implementing what you suggested.

>

> Sorry it took me so long, but as implementing this hook was of course

> not sufficient, and it took me a while to figure out I needed to keep the

> non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created

> another one... I shouldn't have added so many tests ;-)

>

> I'm not sure how to improve the vectorizer doc, to better describe the

> vec_cmp/vcond patterns and see which ones the vectorizer is trying

> to use (to understand which ones I should implement).

>

> Then I realized I was about to break Neon support, so I decided

> it was safer to add Neon tests ;-)

>

> Is that version OK?

>

> Thanks,

>

> Christophe

>

>

> > Thanks,

> > Richard

> >

> > > The pr100757*.c testcases are derived from

> > > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using

> > > different types and return values different from 0 and 1 to avoid

> > > commonalization with boolean masks.

> > >

> > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we

> > > generate the code below:

> > >

> > > float a[32];

> > > float fn1(int d) {

> > >   int c = 4;

> > >   for (int b = 0; b < 8; b++)

> > >     if (a[b] != 2.0f)

> > >       c = 5;

> > >   return c;

> > > }

> > >

> > > fn1:

> > >         ldr     r3, .L4+80

> > >       vpush.64        {d8, d9}

> > >       vldrw.32        q3, [r3]        // q3=a[0..3]

> > >       vldr.64 d8, .L4                 // q4=(2.0,2.0,2.0,2.0)

> > >       vldr.64 d9, .L4+8

> > >       adds    r3, r3, #16

> > >       vcmp.f32        eq, q3, q4      // cmp a[0..3] ==

> (2.0,2.0,2.0,2.0)

> > >       vldr.64 d2, .L4+16              // q1=(1,1,1,1)

> > >       vldr.64 d3, .L4+24

> > >       vldrw.32        q3, [r3]        // q3=a[4..7]

> > >       vldr.64 d4, .L4+32              // q2=(0,0,0,0)

> > >       vldr.64 d5, .L4+40

> > >       vpsel q0, q1, q2                // q0=select (a[0..3])

> > >       vcmp.f32        eq, q3, q4      // cmp a[4..7] ==

> (2.0,2.0,2.0,2.0)

> > >       vldm    sp!, {d8-d9}

> > >       vpsel q2, q1, q2                // q2=select (a[4..7])

> > >       vand    q2, q0, q2              // q2=select (a[0..3]) && select

> (a[4..7])

> > >       vldr.64 d6, .L4+48              // q3=(4.0,4.0,4.0,4.0)

> > >       vldr.64 d7, .L4+56

> > >       vldr.64 d0, .L4+64              // q0=(5.0,5.0,5.0,5.0)

> > >       vldr.64 d1, .L4+72

> > >       vcmp.i32  eq, q2, q1            // cmp mask(a[0..7]) == (1,1,1,1)

> > >       vpsel q3, q3, q0                // q3= vcond_mask(4.0,5.0)

> > >       vmov.32 r3, q3[0]               // keep the scalar

> maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch

> > >       vmov.32 r1, q3[1]

> > >       vmov.32 r0, q3[3]

> > >       vmov.32 r2, q3[2]

> > >       vmov    s14, r1

> > >       vmov    s15, r3

> > >       vmaxnm.f32      s15, s15, s14

> > >       vmov    s14, r2

> > >       vmaxnm.f32      s15, s15, s14

> > >       vmov    s14, r0

> > >       vmaxnm.f32      s15, s15, s14

> > >       vmov    r0, s15

> > >       bx      lr

> > >       .L5:

> > >       .align  3

> > >       .L4:

> > >       .word   1073741824

> > >       .word   1073741824

> > >       .word   1073741824

> > >       .word   1073741824

> > >       .word   1

> > >       .word   1

> > >       .word   1

> > >       .word   1

> > >       .word   0

> > >       .word   0

> > >       .word   0

> > >       .word   0

> > >       .word   1082130432

> > >       .word   1082130432

> > >       .word   1082130432

> > >       .word   1082130432

> > >       .word   1084227584

> > >       .word   1084227584

> > >       .word   1084227584

> > >       .word   1084227584

> > >

> > > 2021-06-09  Christophe Lyon  <christophe.lyon@linaro.org>

> > >

> > >       PR target/100757

> > >       gcc/

> > >       * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix

> > >       expansion for MVE.

> > >

> > >       gcc/testsuite/

> > >       * gcc.target/arm/simd/pr100757.c: New test.

> > >       * gcc.target/arm/simd/pr100757-2.c: New test.

> > >       * gcc.target/arm/simd/pr100757-3.c: New test.

> > > ---

> > >  gcc/config/arm/vec-common.md                  | 24 +++++++++++++++++--

> > >  .../gcc.target/arm/simd/pr100757-2.c          | 20 ++++++++++++++++

> > >  .../gcc.target/arm/simd/pr100757-3.c          | 20 ++++++++++++++++

> > >  gcc/testsuite/gcc.target/arm/simd/pr100757.c  | 19 +++++++++++++++

> > >  4 files changed, 81 insertions(+), 2 deletions(-)

> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > >  create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > >

> > > diff --git a/gcc/config/arm/vec-common.md

> b/gcc/config/arm/vec-common.md

> > > index 0ffc7a9322c..ccdfaa8321f 100644

> > > --- a/gcc/config/arm/vec-common.md

> > > +++ b/gcc/config/arm/vec-common.md

> > > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>"

> > >      }

> > >    else if (TARGET_HAVE_MVE)

> > >      {

> > > -      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],

> > > -                                 operands[1], operands[2],

> operands[3]));

> > > +      /* Convert pre-computed vector comparison into VPR.P0 by

> comparing

> > > +         operand 3 with a vector of '1', then use VPSEL.  */

> > > +      machine_mode cmp_mode = GET_MODE (operands[3]);

> > > +      rtx vpr_p0 = gen_reg_rtx (HImode);

> > > +      rtx one = gen_reg_rtx (cmp_mode);

> > > +      emit_move_insn (one, CONST1_RTX (cmp_mode));

> > > +      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3],

> one));

> > > +

> > > +      switch (GET_MODE_CLASS (<MODE>mode))

> > > +        {

> > > +          case MODE_VECTOR_INT:

> > > +            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode,

> operands[0], operands[1], operands[2], vpr_p0));

> > > +            break;

> > > +          case MODE_VECTOR_FLOAT:

> > > +         if (TARGET_HAVE_MVE_FLOAT)

> > > +              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0],

> operands[1], operands[2], vpr_p0));

> > > +         else

> > > +           gcc_unreachable ();

> > > +            break;

> > > +          default:

> > > +            gcc_unreachable ();

> > > +         }

> > >      }

> > >    else

> > >      gcc_unreachable ();

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > > new file mode 100644

> > > index 00000000000..993ce369090

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c

> > > @@ -0,0 +1,20 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> > > +

> > > +float a[32];

> > > +int fn1(int d) {

> > > +  int c = 4;

> > > +  for (int b = 0; b < 32; b++)

> > > +    if (a[b] != 2.0f)

> > > +      c = 5;

> > > +  return c;

> > > +}

> > > +

> > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */

> /* Constant 2.0f.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /*

> 'true' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /*

> 'false' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /*

> Initial value for c.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /*

> Possible value for c.  */

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > > new file mode 100644

> > > index 00000000000..b94a73b2d2c

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c

> > > @@ -0,0 +1,20 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve_fp } */

> > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */

> > > +/* Copied from gcc.c-torture/compile/20160205-1.c.  */

> > > +

> > > +float a[32];

> > > +float fn1(int d) {

> > > +  float c = 4;

> > > +  for (int b = 0; b < 32; b++)

> > > +    if (a[b] != 2.0f)

> > > +      c = 5;

> > > +  return c;

> > > +}

> > > +

> > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */

> /* Constant 2.0f.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /*

> 'true' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /*

> 'false' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */

> /* Initial value for c.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */

> /* Possible value for c.  */

> > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > > new file mode 100644

> > > index 00000000000..e51e716b4ec

> > > --- /dev/null

> > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c

> > > @@ -0,0 +1,19 @@

> > > +/* { dg-do compile } */

> > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */

> > > +/* { dg-add-options arm_v8_1m_mve } */

> > > +/* { dg-additional-options "-O3" } */

> > > +/* Derived from gcc.c-torture/compile/20160205-1.c.  */

> > > +

> > > +int a[32];

> > > +int fn1(int d) {

> > > +  int c = 2;

> > > +  for (int b = 0; b < 32; b++)

> > > +    if (a[b])

> > > +      c = 3;

> > > +  return c;

> > > +}

> > > +

> > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /*

> 'true' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /*

> 'false' mask.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /*

> Initial value for c.  */

> > > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /*

> Possible value for c.  */

>
Thomas Koenig via Gcc-patches July 16, 2021, 2:06 p.m. | #5
Hi,

Sorry for the slow review.  I'd initially held off from reviewing
because it sounded like you were trying to treat predicates as
MODE_VECTOR_BOOL instead.  Is that right?  If so, how did that go?

It does feel like the right long-term direction.  Treating masks as
integers for AVX seems to make some things more difficult than they
should be.  Also, RTL like:

> +(define_expand "vec_cmp<mode>hi"

> +  [(set (match_operand:HI 0 "s_register_operand")

> +	(match_operator:HI 1 "comparison_operator"

> +	  [(match_operand:MVE_VLD_ST 2 "s_register_operand")

> +	   (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))]

> +  "TARGET_HAVE_MVE

> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> +{

> +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),

> +			     operands[2], operands[3], false, false);

> +  DONE;

> +})


seems kind-of suspect, since (as I think you said in a PR),
(eq:HI X Y) would normally be a single boolean.  Having MODE_VECTOR_BOOL
means that we can represent the comparisons “properly”, even in
define_insns.

But since this usage is confined to define_expands, I guess it
doesn't matter much for the purposes of this patch.  Any switch
to MODE_VECTOR_BOOL would leave most of the patch in tact (contrary
to what I'd initially assumed).

> @@ -31061,13 +31065,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,

>  

>  	  /* If we are not expanding a vcond, build the result here.  */

>  	  if (!vcond_mve)

> -	    {

> -	      rtx zero = gen_reg_rtx (cmp_result_mode);

> -	      rtx one = gen_reg_rtx (cmp_result_mode);

> -	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));

> -	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));

> -	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));

> -	    }

> +	    emit_move_insn (target, vpr_p0);


The code above this is:

	  if (vcond_mve)
	    vpr_p0 = target;
	  else
	    vpr_p0 = gen_reg_rtx (HImode);

so couldn't we simply use vpr_p0 = target unconditionally (or drop
vpr_p0 altogether)?  Same for the other cases.

> @@ -31178,20 +31164,21 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)

>  			    mask, operands[1], operands[2]));

>    else

>      {

> -      machine_mode cmp_mode = GET_MODE (operands[4]);

> +      machine_mode cmp_mode = GET_MODE (operands[0]);

>        rtx vpr_p0 = mask;

> -      rtx zero = gen_reg_rtx (cmp_mode);

> -      rtx one = gen_reg_rtx (cmp_mode);

> -      emit_move_insn (zero, CONST0_RTX (cmp_mode));

> -      emit_move_insn (one, CONST1_RTX (cmp_mode));

> +

>        switch (GET_MODE_CLASS (cmp_mode))

>  	{

>  	case MODE_VECTOR_INT:

> -	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));

> +	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0],

> +				     operands[1], operands[2], vpr_p0));

>  	  break;

>  	case MODE_VECTOR_FLOAT:

>  	  if (TARGET_HAVE_MVE_FLOAT)

> -	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));

> +	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0],

> +					 operands[1], operands[2], vpr_p0));

> +	  else

> +	    gcc_unreachable ();

>  	  break;

>  	default:

>  	  gcc_unreachable ();


Here too vpr_p0 feels a bit redundant now.

> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md

> index e393518ea88..a9840408bdd 100644

> --- a/gcc/config/arm/mve.md

> +++ b/gcc/config/arm/mve.md

> @@ -10516,3 +10516,58 @@ (define_insn "*movmisalign<mode>_mve_load"

>    "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"

>    [(set_attr "type" "mve_load")]

>  )

> +

> +;; Expanders for vec_cmp and vcond

> +

> +(define_expand "vec_cmp<mode>hi"

> +  [(set (match_operand:HI 0 "s_register_operand")

> +	(match_operator:HI 1 "comparison_operator"

> +	  [(match_operand:MVE_VLD_ST 2 "s_register_operand")

> +	   (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))]

> +  "TARGET_HAVE_MVE

> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"


Is flag_unsafe_math_optimizations needed for MVE?  For Neon we had
it because of flush to zero (at least, I think that was the only reason).

> +{

> +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),

> +			     operands[2], operands[3], false, false);

> +  DONE;

> +})


The (snipped) tests look good, but if we do support
!flag_unsafe_math_optimizations, it would be good to have some tests
for unordered comparisons too.  E.g.:

#define ordered(A, B) (!__builtin_isunordered (A, B))
#define unordered(A, B) (__builtin_isunordered (A, B))
#define ueq(A, B) (!__builtin_islessgreater (A, B))
#define ult(A, B) (__builtin_isless (A, B))
#define ule(A, B) (__builtin_islessequal (A, B))
#define uge(A, B) (__builtin_isgreaterequal (A, B))
#define ugt(A, B) (__builtin_isgreater (A, B))
#define nueq(A, B) (__builtin_islessgreater (A, B))
#define nult(A, B) (!__builtin_isless (A, B))
#define nule(A, B) (!__builtin_islessequal (A, B))
#define nuge(A, B) (!__builtin_isgreaterequal (A, B))
#define nugt(A, B) (!__builtin_isgreater (A, B))

The main thing is just to check that they don't ICE.  It might be
difficult to check the assembly for correctness.

Thanks,
Richard
Thomas Koenig via Gcc-patches July 16, 2021, 2:32 p.m. | #6
On 16/07/2021 16:06, Richard Sandiford via Gcc-patches wrote:
> Hi,

>

> Sorry for the slow review.  I'd initially held off from reviewing

> because it sounded like you were trying to treat predicates as

> MODE_VECTOR_BOOL instead.  Is that right?  If so, how did that go?



Yes, that's part of PR 101325. It's still WIP as I wrote in the PR, I'm 
not sure I got it right yet. At the moment it seems it would imply a lot 
of changes, I'll have to look at AArch64's implementation in more details.

I hoped this fix could be merged before switching to MODE_VECTOR_BOOL.


> It does feel like the right long-term direction.  Treating masks as

> integers for AVX seems to make some things more difficult than they

> should be.  Also, RTL like:


OK, I see, good to know.


>

>> +(define_expand "vec_cmp<mode>hi"

>> +  [(set (match_operand:HI 0 "s_register_operand")

>> +	(match_operator:HI 1 "comparison_operator"

>> +	  [(match_operand:MVE_VLD_ST 2 "s_register_operand")

>> +	   (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))]

>> +  "TARGET_HAVE_MVE

>> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

>> +{

>> +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),

>> +			     operands[2], operands[3], false, false);

>> +  DONE;

>> +})

> seems kind-of suspect, since (as I think you said in a PR),

> (eq:HI X Y) would normally be a single boolean.  Having MODE_VECTOR_BOOL

> means that we can represent the comparisons “properly”, even in

> define_insns.

>

> But since this usage is confined to define_expands, I guess it

> doesn't matter much for the purposes of this patch.  Any switch

> to MODE_VECTOR_BOOL would leave most of the patch in tact (contrary

> to what I'd initially assumed).

>

>> @@ -31061,13 +31065,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,

>>   

>>   	  /* If we are not expanding a vcond, build the result here.  */

>>   	  if (!vcond_mve)

>> -	    {

>> -	      rtx zero = gen_reg_rtx (cmp_result_mode);

>> -	      rtx one = gen_reg_rtx (cmp_result_mode);

>> -	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));

>> -	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));

>> -	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));

>> -	    }

>> +	    emit_move_insn (target, vpr_p0);

> The code above this is:

>

> 	  if (vcond_mve)

> 	    vpr_p0 = target;

> 	  else

> 	    vpr_p0 = gen_reg_rtx (HImode);

>

> so couldn't we simply use vpr_p0 = target unconditionally (or drop

> vpr_p0 altogether)?  Same for the other cases.

Probably, I'll check that.
>> @@ -31178,20 +31164,21 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)

>>   			    mask, operands[1], operands[2]));

>>     else

>>       {

>> -      machine_mode cmp_mode = GET_MODE (operands[4]);

>> +      machine_mode cmp_mode = GET_MODE (operands[0]);

>>         rtx vpr_p0 = mask;

>> -      rtx zero = gen_reg_rtx (cmp_mode);

>> -      rtx one = gen_reg_rtx (cmp_mode);

>> -      emit_move_insn (zero, CONST0_RTX (cmp_mode));

>> -      emit_move_insn (one, CONST1_RTX (cmp_mode));

>> +

>>         switch (GET_MODE_CLASS (cmp_mode))

>>   	{

>>   	case MODE_VECTOR_INT:

>> -	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));

>> +	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0],

>> +				     operands[1], operands[2], vpr_p0));

>>   	  break;

>>   	case MODE_VECTOR_FLOAT:

>>   	  if (TARGET_HAVE_MVE_FLOAT)

>> -	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));

>> +	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0],

>> +					 operands[1], operands[2], vpr_p0));

>> +	  else

>> +	    gcc_unreachable ();

>>   	  break;

>>   	default:

>>   	  gcc_unreachable ();

> Here too vpr_p0 feels a bit redundant now.


Indeed, but it seemed clearer to me :-)

>> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md

>> index e393518ea88..a9840408bdd 100644

>> --- a/gcc/config/arm/mve.md

>> +++ b/gcc/config/arm/mve.md

>> @@ -10516,3 +10516,58 @@ (define_insn "*movmisalign<mode>_mve_load"

>>     "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"

>>     [(set_attr "type" "mve_load")]

>>   )

>> +

>> +;; Expanders for vec_cmp and vcond

>> +

>> +(define_expand "vec_cmp<mode>hi"

>> +  [(set (match_operand:HI 0 "s_register_operand")

>> +	(match_operator:HI 1 "comparison_operator"

>> +	  [(match_operand:MVE_VLD_ST 2 "s_register_operand")

>> +	   (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))]

>> +  "TARGET_HAVE_MVE

>> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

> Is flag_unsafe_math_optimizations needed for MVE?  For Neon we had

> it because of flush to zero (at least, I think that was the only reason).


Right, I inherited this from the vec_cmp in neon.md. I do not have the 
ARM ARM at hand right now to check.

However, your question makes me wonder about the other vec_cmp pattern 
in vec-common.md, which is common to Neon and MVE. It might need to be 
adjusted too.

>> +{

>> +  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),

>> +			     operands[2], operands[3], false, false);

>> +  DONE;

>> +})

> The (snipped) tests look good, but if we do support

> !flag_unsafe_math_optimizations, it would be good to have some tests

> for unordered comparisons too.  E.g.:

>

> #define ordered(A, B) (!__builtin_isunordered (A, B))

> #define unordered(A, B) (__builtin_isunordered (A, B))

> #define ueq(A, B) (!__builtin_islessgreater (A, B))

> #define ult(A, B) (__builtin_isless (A, B))

> #define ule(A, B) (__builtin_islessequal (A, B))

> #define uge(A, B) (__builtin_isgreaterequal (A, B))

> #define ugt(A, B) (__builtin_isgreater (A, B))

> #define nueq(A, B) (__builtin_islessgreater (A, B))

> #define nult(A, B) (!__builtin_isless (A, B))

> #define nule(A, B) (!__builtin_islessequal (A, B))

> #define nuge(A, B) (!__builtin_isgreaterequal (A, B))

> #define nugt(A, B) (!__builtin_isgreater (A, B))

>

> The main thing is just to check that they don't ICE.  It might be

> difficult to check the assembly for correctness.


Thanks I'll check that in August, when I'm back from holidays.

Christophe


>

> Thanks,

> Richard

Patch

diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 0ffc7a9322c..ccdfaa8321f 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -478,8 +478,28 @@  (define_expand "vcond_mask_<mode><v_cmp_result>"
     }
   else if (TARGET_HAVE_MVE)
     {
-      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
-                                 operands[1], operands[2], operands[3]));
+      /* Convert pre-computed vector comparison into VPR.P0 by comparing
+         operand 3 with a vector of '1', then use VPSEL.  */
+      machine_mode cmp_mode = GET_MODE (operands[3]);
+      rtx vpr_p0 = gen_reg_rtx (HImode);
+      rtx one = gen_reg_rtx (cmp_mode);
+      emit_move_insn (one, CONST1_RTX (cmp_mode));
+      emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one));
+
+      switch (GET_MODE_CLASS (<MODE>mode))
+        {
+          case MODE_VECTOR_INT:
+            emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0));
+            break;
+          case MODE_VECTOR_FLOAT:
+	    if (TARGET_HAVE_MVE_FLOAT)
+              emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0));
+	    else
+	      gcc_unreachable ();
+            break;
+          default:
+            gcc_unreachable ();
+         }
     }
   else
     gcc_unreachable ();
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
new file mode 100644
index 00000000000..993ce369090
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+float a[32];
+int fn1(int d) {
+  int c = 4;
+  for (int b = 0; b < 32; b++)
+    if (a[b] != 2.0f)
+      c = 5;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
new file mode 100644
index 00000000000..b94a73b2d2c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */
+/* Copied from gcc.c-torture/compile/20160205-1.c.  */
+
+float a[32];
+float fn1(int d) {
+  float c = 4;
+  for (int b = 0; b < 32; b++)
+    if (a[b] != 2.0f)
+      c = 5;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f.  */
+/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c.  */
diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c
new file mode 100644
index 00000000000..e51e716b4ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+/* Derived from gcc.c-torture/compile/20160205-1.c.  */
+
+int a[32];
+int fn1(int d) {
+  int c = 2;
+  for (int b = 0; b < 32; b++)
+    if (a[b])
+      c = 3;
+  return c;
+}
+
+/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask.  */
+/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c.  */
+/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c.  */