fix detection of negative step DR groups

Message ID nycvar.YFH.7.76.2007071501400.4397@zhemvz.fhfr.qr
State New
Headers show
Series
  • fix detection of negative step DR groups
Related show

Commit Message

Richard Biener July 7, 2020, 1:03 p.m.
This fixes a condition that caused all negative step DR groups to
be detected as single element interleaving.  Such groups are
rejected by interleaving vectorization but miscompiled by SLP
which is fixed by forcing VMAT_STRIDED_SLP for now.

Bootstrap & regtest running on x86_64-unknown-linux-gnu.

	* tree-vect-data-refs.c (vect_analyze_data_ref_accesses): Fix
	group overlap condition to allow negative step DR groups.
	* tree-vect-stmts.c (get_group_load_store_type): For
	multi element SLP groups force VMAT_STRIDED_SLP when the step
	is negative.

	* gcc.dg/vect/slp-47.c: New testcase.
	* gcc.dg/vect/slp-48.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/slp-47.c | 56 ++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/slp-48.c | 56 ++++++++++++++++++++++++++++++
 gcc/tree-vect-data-refs.c          |  8 +++--
 gcc/tree-vect-stmts.c              | 11 ++++--
 4 files changed, 126 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-47.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-48.c

-- 
2.26.2

Comments

Richard Sandiford July 7, 2020, 5:11 p.m. | #1
Richard Biener <rguenther@suse.de> writes:
> This fixes a condition that caused all negative step DR groups to

> be detected as single element interleaving.  Such groups are

> rejected by interleaving vectorization but miscompiled by SLP

> which is fixed by forcing VMAT_STRIDED_SLP for now.

>

> Bootstrap & regtest running on x86_64-unknown-linux-gnu.


Where does the link between VMAT_CONTIGUOUS_REVERSE and single-element
interleaving happen?  (Sorry, should probably spend more time figuring
that out for myself.)

In principle VMAT_CONTIGUOUS_REVERSE seems like the right classification
for the testcases.

Thanks,
Richard



>

> 	* tree-vect-data-refs.c (vect_analyze_data_ref_accesses): Fix

> 	group overlap condition to allow negative step DR groups.

> 	* tree-vect-stmts.c (get_group_load_store_type): For

> 	multi element SLP groups force VMAT_STRIDED_SLP when the step

> 	is negative.

>

> 	* gcc.dg/vect/slp-47.c: New testcase.

> 	* gcc.dg/vect/slp-48.c: Likewise.

> ---

>  gcc/testsuite/gcc.dg/vect/slp-47.c | 56 ++++++++++++++++++++++++++++++

>  gcc/testsuite/gcc.dg/vect/slp-48.c | 56 ++++++++++++++++++++++++++++++

>  gcc/tree-vect-data-refs.c          |  8 +++--

>  gcc/tree-vect-stmts.c              | 11 ++++--

>  4 files changed, 126 insertions(+), 5 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-47.c

>  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-48.c

>

> diff --git a/gcc/testsuite/gcc.dg/vect/slp-47.c b/gcc/testsuite/gcc.dg/vect/slp-47.c

> new file mode 100644

> index 00000000000..7b2ddf664df

> --- /dev/null

> +++ b/gcc/testsuite/gcc.dg/vect/slp-47.c

> @@ -0,0 +1,56 @@

> +/* { dg-require-effective-target vect_int } */

> +

> +#include "tree-vect.h"

> +

> +int x[1024], y[1024];

> +

> +void __attribute__((noipa)) foo()

> +{

> +  for (int i = 0; i < 512; ++i)

> +    {

> +      x[2*i] = y[1023 - (2*i)];

> +      x[2*i+1] = y[1023 - (2*i+1)];

> +    }

> +}

> +

> +void __attribute__((noipa)) bar()

> +{

> +  for (int i = 0; i < 512; ++i)

> +    {

> +      x[2*i] = y[1023 - (2*i+1)];

> +      x[2*i+1] = y[1023 - (2*i)];

> +    }

> +}

> +

> +int 

> +main ()

> +{

> +  check_vect ();

> +

> +  for (int i = 0; i < 1024; ++i)

> +    {

> +      x[i] = 0;

> +      y[i] = i;

> +      __asm__ volatile ("");

> +    }

> +

> +  foo ();

> +  for (int i = 0; i < 1024; ++i)

> +    if (x[i] != y[1023 - i])

> +      abort ();

> +

> +  for (int i = 0; i < 1024; ++i)

> +    {

> +      x[i] = 0;

> +      __asm__ volatile ("");

> +    }

> +

> +  bar ();

> +  for (int i = 0; i < 1024; ++i)

> +    if (x[i] != y[1023 - i^1])

> +      abort ();

> +

> +  return 0;

> +}

> +

> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */

> diff --git a/gcc/testsuite/gcc.dg/vect/slp-48.c b/gcc/testsuite/gcc.dg/vect/slp-48.c

> new file mode 100644

> index 00000000000..0b327aede8e

> --- /dev/null

> +++ b/gcc/testsuite/gcc.dg/vect/slp-48.c

> @@ -0,0 +1,56 @@

> +/* { dg-require-effective-target vect_int } */

> +

> +#include "tree-vect.h"

> +

> +int x[1024], y[1024];

> +

> +void __attribute__((noipa)) foo()

> +{

> +  for (int i = 0; i < 512; ++i)

> +    {

> +      x[1023 - (2*i+1)] = y[2*i];

> +      x[1023 - (2*i)] = y[2*i+1];

> +    }

> +}

> +

> +void __attribute__((noipa)) bar()

> +{

> +  for (int i = 0; i < 512; ++i)

> +    {

> +      x[1023 - (2*i+1)] = y[2*i+1];

> +      x[1023 - (2*i)] = y[2*i];

> +    }

> +}

> +

> +int 

> +main ()

> +{

> +  check_vect ();

> +

> +  for (int i = 0; i < 1024; ++i)

> +    {

> +      x[i] = 0;

> +      y[i] = i;

> +      __asm__ volatile ("");

> +    }

> +

> +  foo ();

> +  for (int i = 0; i < 1024; ++i)

> +    if (x[i] != y[1023 - i^1])

> +      abort ();

> +

> +  for (int i = 0; i < 1024; ++i)

> +    {

> +      x[i] = 0;

> +      __asm__ volatile ("");

> +    }

> +

> +  bar ();

> +  for (int i = 0; i < 1024; ++i)

> +    if (x[i] != y[1023 - i])

> +      abort ();

> +

> +  return 0;

> +}

> +

> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */

> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c

> index 959c2d3378f..2b4421b5fb4 100644

> --- a/gcc/tree-vect-data-refs.c

> +++ b/gcc/tree-vect-data-refs.c

> @@ -3074,13 +3074,15 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)

>  	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)

>  		break;

>  

> -	      /* If the step (if not zero or non-constant) is greater than the

> +	      /* If the step (if not zero or non-constant) is smaller than the

>  		 difference between data-refs' inits this splits groups into

>  		 suitable sizes.  */

>  	      if (tree_fits_shwi_p (DR_STEP (dra)))

>  		{

> -		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));

> -		  if (step != 0 && step <= (init_b - init_a))

> +		  unsigned HOST_WIDE_INT step

> +		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));

> +		  if (step != 0

> +		      && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))

>  		    break;

>  		}

>  	    }

> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c

> index f66c5f5b367..fcae3ef5f35 100644

> --- a/gcc/tree-vect-stmts.c

> +++ b/gcc/tree-vect-stmts.c

> @@ -2150,8 +2150,15 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,

>  	    }

>  	  int cmp = compare_step_with_zero (vinfo, stmt_info);

>  	  if (cmp < 0)

> -	    *memory_access_type = get_negative_load_store_type

> -	      (vinfo, stmt_info, vectype, vls_type, 1);

> +	    {

> +	      if (single_element_p)

> +		/* ???  The VMAT_CONTIGUOUS_REVERSE code generation is

> +		   only correct for single element "interleaving" SLP.  */

> +		*memory_access_type = get_negative_load_store_type

> +				       (vinfo, stmt_info, vectype, vls_type, 1);

> +	      else

> +		*memory_access_type = VMAT_STRIDED_SLP;

> +	    }

>  	  else

>  	    {

>  	      gcc_assert (!loop_vinfo || cmp > 0);
Richard Biener July 8, 2020, 7:33 a.m. | #2
On Tue, 7 Jul 2020, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:

> > This fixes a condition that caused all negative step DR groups to

> > be detected as single element interleaving.  Such groups are

> > rejected by interleaving vectorization but miscompiled by SLP

> > which is fixed by forcing VMAT_STRIDED_SLP for now.

> >

> > Bootstrap & regtest running on x86_64-unknown-linux-gnu.

> 

> Where does the link between VMAT_CONTIGUOUS_REVERSE and single-element

> interleaving happen?  (Sorry, should probably spend more time figuring

> that out for myself.)


You go via get_group_load_store_type to get_negative_load_store_type
but only if (slp) - as said the non-SLP path doesn't seem to support
grouped accesses (interleaving) with negative step.

> In principle VMAT_CONTIGUOUS_REVERSE seems like the right classification

> for the testcases.


Yes, it does.  It's just the existing implementation is wrong
for SLP at least - it works for single-elements only since
the permute mask for reverse is a fixed {n-1, n-2,...0} rather
than reversing the groups and also the offsetting done
by vector-size - 1 is not taking into account group size either.

I ran into this (and the other alignment issue) because I thought
representing non-grouped loads as single element interleaving
groups would make those easy to support for SLP but a naiive
implementation runs into all kind of interesting things ;)

Richard.

> Thanks,

> Richard

> 

> 

> 

> >

> > 	* tree-vect-data-refs.c (vect_analyze_data_ref_accesses): Fix

> > 	group overlap condition to allow negative step DR groups.

> > 	* tree-vect-stmts.c (get_group_load_store_type): For

> > 	multi element SLP groups force VMAT_STRIDED_SLP when the step

> > 	is negative.

> >

> > 	* gcc.dg/vect/slp-47.c: New testcase.

> > 	* gcc.dg/vect/slp-48.c: Likewise.

> > ---

> >  gcc/testsuite/gcc.dg/vect/slp-47.c | 56 ++++++++++++++++++++++++++++++

> >  gcc/testsuite/gcc.dg/vect/slp-48.c | 56 ++++++++++++++++++++++++++++++

> >  gcc/tree-vect-data-refs.c          |  8 +++--

> >  gcc/tree-vect-stmts.c              | 11 ++++--

> >  4 files changed, 126 insertions(+), 5 deletions(-)

> >  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-47.c

> >  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-48.c

> >

> > diff --git a/gcc/testsuite/gcc.dg/vect/slp-47.c b/gcc/testsuite/gcc.dg/vect/slp-47.c

> > new file mode 100644

> > index 00000000000..7b2ddf664df

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.dg/vect/slp-47.c

> > @@ -0,0 +1,56 @@

> > +/* { dg-require-effective-target vect_int } */

> > +

> > +#include "tree-vect.h"

> > +

> > +int x[1024], y[1024];

> > +

> > +void __attribute__((noipa)) foo()

> > +{

> > +  for (int i = 0; i < 512; ++i)

> > +    {

> > +      x[2*i] = y[1023 - (2*i)];

> > +      x[2*i+1] = y[1023 - (2*i+1)];

> > +    }

> > +}

> > +

> > +void __attribute__((noipa)) bar()

> > +{

> > +  for (int i = 0; i < 512; ++i)

> > +    {

> > +      x[2*i] = y[1023 - (2*i+1)];

> > +      x[2*i+1] = y[1023 - (2*i)];

> > +    }

> > +}

> > +

> > +int 

> > +main ()

> > +{

> > +  check_vect ();

> > +

> > +  for (int i = 0; i < 1024; ++i)

> > +    {

> > +      x[i] = 0;

> > +      y[i] = i;

> > +      __asm__ volatile ("");

> > +    }

> > +

> > +  foo ();

> > +  for (int i = 0; i < 1024; ++i)

> > +    if (x[i] != y[1023 - i])

> > +      abort ();

> > +

> > +  for (int i = 0; i < 1024; ++i)

> > +    {

> > +      x[i] = 0;

> > +      __asm__ volatile ("");

> > +    }

> > +

> > +  bar ();

> > +  for (int i = 0; i < 1024; ++i)

> > +    if (x[i] != y[1023 - i^1])

> > +      abort ();

> > +

> > +  return 0;

> > +}

> > +

> > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */

> > diff --git a/gcc/testsuite/gcc.dg/vect/slp-48.c b/gcc/testsuite/gcc.dg/vect/slp-48.c

> > new file mode 100644

> > index 00000000000..0b327aede8e

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.dg/vect/slp-48.c

> > @@ -0,0 +1,56 @@

> > +/* { dg-require-effective-target vect_int } */

> > +

> > +#include "tree-vect.h"

> > +

> > +int x[1024], y[1024];

> > +

> > +void __attribute__((noipa)) foo()

> > +{

> > +  for (int i = 0; i < 512; ++i)

> > +    {

> > +      x[1023 - (2*i+1)] = y[2*i];

> > +      x[1023 - (2*i)] = y[2*i+1];

> > +    }

> > +}

> > +

> > +void __attribute__((noipa)) bar()

> > +{

> > +  for (int i = 0; i < 512; ++i)

> > +    {

> > +      x[1023 - (2*i+1)] = y[2*i+1];

> > +      x[1023 - (2*i)] = y[2*i];

> > +    }

> > +}

> > +

> > +int 

> > +main ()

> > +{

> > +  check_vect ();

> > +

> > +  for (int i = 0; i < 1024; ++i)

> > +    {

> > +      x[i] = 0;

> > +      y[i] = i;

> > +      __asm__ volatile ("");

> > +    }

> > +

> > +  foo ();

> > +  for (int i = 0; i < 1024; ++i)

> > +    if (x[i] != y[1023 - i^1])

> > +      abort ();

> > +

> > +  for (int i = 0; i < 1024; ++i)

> > +    {

> > +      x[i] = 0;

> > +      __asm__ volatile ("");

> > +    }

> > +

> > +  bar ();

> > +  for (int i = 0; i < 1024; ++i)

> > +    if (x[i] != y[1023 - i])

> > +      abort ();

> > +

> > +  return 0;

> > +}

> > +

> > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */

> > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c

> > index 959c2d3378f..2b4421b5fb4 100644

> > --- a/gcc/tree-vect-data-refs.c

> > +++ b/gcc/tree-vect-data-refs.c

> > @@ -3074,13 +3074,15 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)

> >  	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)

> >  		break;

> >  

> > -	      /* If the step (if not zero or non-constant) is greater than the

> > +	      /* If the step (if not zero or non-constant) is smaller than the

> >  		 difference between data-refs' inits this splits groups into

> >  		 suitable sizes.  */

> >  	      if (tree_fits_shwi_p (DR_STEP (dra)))

> >  		{

> > -		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));

> > -		  if (step != 0 && step <= (init_b - init_a))

> > +		  unsigned HOST_WIDE_INT step

> > +		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));

> > +		  if (step != 0

> > +		      && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))

> >  		    break;

> >  		}

> >  	    }

> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c

> > index f66c5f5b367..fcae3ef5f35 100644

> > --- a/gcc/tree-vect-stmts.c

> > +++ b/gcc/tree-vect-stmts.c

> > @@ -2150,8 +2150,15 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,

> >  	    }

> >  	  int cmp = compare_step_with_zero (vinfo, stmt_info);

> >  	  if (cmp < 0)

> > -	    *memory_access_type = get_negative_load_store_type

> > -	      (vinfo, stmt_info, vectype, vls_type, 1);

> > +	    {

> > +	      if (single_element_p)

> > +		/* ???  The VMAT_CONTIGUOUS_REVERSE code generation is

> > +		   only correct for single element "interleaving" SLP.  */

> > +		*memory_access_type = get_negative_load_store_type

> > +				       (vinfo, stmt_info, vectype, vls_type, 1);

> > +	      else

> > +		*memory_access_type = VMAT_STRIDED_SLP;

> > +	    }

> >  	  else

> >  	    {

> >  	      gcc_assert (!loop_vinfo || cmp > 0);

> 


-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/slp-47.c b/gcc/testsuite/gcc.dg/vect/slp-47.c
new file mode 100644
index 00000000000..7b2ddf664df
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-47.c
@@ -0,0 +1,56 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+int x[1024], y[1024];
+
+void __attribute__((noipa)) foo()
+{
+  for (int i = 0; i < 512; ++i)
+    {
+      x[2*i] = y[1023 - (2*i)];
+      x[2*i+1] = y[1023 - (2*i+1)];
+    }
+}
+
+void __attribute__((noipa)) bar()
+{
+  for (int i = 0; i < 512; ++i)
+    {
+      x[2*i] = y[1023 - (2*i+1)];
+      x[2*i+1] = y[1023 - (2*i)];
+    }
+}
+
+int 
+main ()
+{
+  check_vect ();
+
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[i] = 0;
+      y[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo ();
+  for (int i = 0; i < 1024; ++i)
+    if (x[i] != y[1023 - i])
+      abort ();
+
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[i] = 0;
+      __asm__ volatile ("");
+    }
+
+  bar ();
+  for (int i = 0; i < 1024; ++i)
+    if (x[i] != y[1023 - i^1])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-48.c b/gcc/testsuite/gcc.dg/vect/slp-48.c
new file mode 100644
index 00000000000..0b327aede8e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-48.c
@@ -0,0 +1,56 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+int x[1024], y[1024];
+
+void __attribute__((noipa)) foo()
+{
+  for (int i = 0; i < 512; ++i)
+    {
+      x[1023 - (2*i+1)] = y[2*i];
+      x[1023 - (2*i)] = y[2*i+1];
+    }
+}
+
+void __attribute__((noipa)) bar()
+{
+  for (int i = 0; i < 512; ++i)
+    {
+      x[1023 - (2*i+1)] = y[2*i+1];
+      x[1023 - (2*i)] = y[2*i];
+    }
+}
+
+int 
+main ()
+{
+  check_vect ();
+
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[i] = 0;
+      y[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo ();
+  for (int i = 0; i < 1024; ++i)
+    if (x[i] != y[1023 - i^1])
+      abort ();
+
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[i] = 0;
+      __asm__ volatile ("");
+    }
+
+  bar ();
+  for (int i = 0; i < 1024; ++i)
+    if (x[i] != y[1023 - i])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 959c2d3378f..2b4421b5fb4 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3074,13 +3074,15 @@  vect_analyze_data_ref_accesses (vec_info *vinfo)
 	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
 		break;
 
-	      /* If the step (if not zero or non-constant) is greater than the
+	      /* If the step (if not zero or non-constant) is smaller than the
 		 difference between data-refs' inits this splits groups into
 		 suitable sizes.  */
 	      if (tree_fits_shwi_p (DR_STEP (dra)))
 		{
-		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
-		  if (step != 0 && step <= (init_b - init_a))
+		  unsigned HOST_WIDE_INT step
+		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));
+		  if (step != 0
+		      && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
 		    break;
 		}
 	    }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f66c5f5b367..fcae3ef5f35 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2150,8 +2150,15 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	    }
 	  int cmp = compare_step_with_zero (vinfo, stmt_info);
 	  if (cmp < 0)
-	    *memory_access_type = get_negative_load_store_type
-	      (vinfo, stmt_info, vectype, vls_type, 1);
+	    {
+	      if (single_element_p)
+		/* ???  The VMAT_CONTIGUOUS_REVERSE code generation is
+		   only correct for single element "interleaving" SLP.  */
+		*memory_access_type = get_negative_load_store_type
+				       (vinfo, stmt_info, vectype, vls_type, 1);
+	      else
+		*memory_access_type = VMAT_STRIDED_SLP;
+	    }
 	  else
 	    {
 	      gcc_assert (!loop_vinfo || cmp > 0);