Fix PR91178

Message ID alpine.LSU.2.20.1907171221200.2976@zhemvz.fhfr.qr
State New
Headers show
Series
  • Fix PR91178
Related show

Commit Message

Richard Biener July 17, 2019, 10:24 a.m.
This is the vectorizer part of the fix — currently, when we
need to permute a load in contiguous accesses, we also load the
"gap" between two instances of a group.  When the gap is large,
that can cause quite excessive code generation (it is fixed up
later by DCE / forwprop, but it burdens the intermediate passes
compile-time-wise).

The following addresses this in the SLP case by simply skipping
code generation for loads that fall entirely within the gap.  This
avoids the huge IV increment chain which causes all of the
followup issues.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2019-07-17  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/91178
	* tree-vect-stmts.c (get_group_load_store_type): For SLP
	loads with a gap larger than the vector size always use
	VMAT_STRIDED_SLP.
	(vectorizable_load): For VMAT_STRIDED_SLP with a permutation
	avoid loading vectors that are only contained in the gap
	and thus are not needed.

	* gcc.dg/torture/pr91178.c: New testcase.

Patch

Index: gcc/testsuite/gcc.dg/torture/pr91178.c
===================================================================
--- gcc/testsuite/gcc.dg/torture/pr91178.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/torture/pr91178.c	(working copy)
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+
+int a;
+extern int f[10][91125];
+int b[50];
+void c()
+{
+  for (int d = 6; d <= a; d++)
+    for (int e = 16; e <= 24; e++)
+      b[e] -= f[d][d];
+}
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 273520)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -2267,6 +2267,14 @@  get_group_load_store_type (stmt_vec_info
 			/ vect_get_scalar_dr_size (first_dr_info)))
 	    overrun_p = false;
 
+	  /* If the gap at the end of the group exceeds a whole vector
+	     in size use the strided SLP code which can skip code-generation
+	     for the gap.  */
+	  if (vls_type == VLS_LOAD && known_gt (gap, nunits))
+	    *memory_access_type = VMAT_STRIDED_SLP;
+	  else
+	    *memory_access_type = VMAT_CONTIGUOUS;
+
 	  /* If the gap splits the vector in half and the target
 	     can do half-vector operations avoid the epilogue peeling
 	     by simply loading half of the vector only.  Usually
@@ -2274,7 +2282,8 @@  get_group_load_store_type (stmt_vec_info
 	  dr_alignment_support alignment_support_scheme;
 	  scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
 	  machine_mode vmode;
-	  if (overrun_p
+	  if (*memory_access_type == VMAT_CONTIGUOUS
+	      && overrun_p
 	      && !masked_p
 	      && (((alignment_support_scheme
 		      = vect_supportable_dr_alignment (first_dr_info, false)))
@@ -2297,7 +2306,6 @@  get_group_load_store_type (stmt_vec_info
 				 "Peeling for outer loop is not supported\n");
 	      return false;
 	    }
-	  *memory_access_type = VMAT_CONTIGUOUS;
 	}
     }
   else
@@ -8732,6 +8740,7 @@  vectorizable_load (stmt_vec_info stmt_in
       /* Checked by get_load_store_type.  */
       unsigned int const_nunits = nunits.to_constant ();
       unsigned HOST_WIDE_INT cst_offset = 0;
+      unsigned int group_gap = 0;
 
       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
       gcc_assert (!nested_in_vect_loop);
@@ -8749,6 +8758,7 @@  vectorizable_load (stmt_vec_info stmt_in
       if (slp && grouped_load)
 	{
 	  group_size = DR_GROUP_SIZE (first_stmt_info);
+	  group_gap = DR_GROUP_GAP (first_stmt_info);
 	  ref_type = get_group_alias_ptr_type (first_stmt_info);
 	}
       else
@@ -8892,6 +8902,14 @@  vectorizable_load (stmt_vec_info stmt_in
 	  if (nloads > 1)
 	    vec_alloc (v, nloads);
 	  stmt_vec_info new_stmt_info = NULL;
+	  if (slp && slp_perm
+	      && (group_el % group_size) > group_size - group_gap
+	      && (group_el % group_size) + nloads * lnel < group_size)
+	    {
+	      dr_chain.quick_push (NULL_TREE);
+	      group_el += nloads * lnel;
+	      continue;
+	    }
 	  for (i = 0; i < nloads; i++)
 	    {
 	      tree this_off = build_int_cst (TREE_TYPE (alias_off),