[PR85720/partial] Support runtime loop versioning if loop can be distributed into builtin functions

Message ID DB6PR0802MB250489D51D3C653EF013F6E4E7940@DB6PR0802MB2504.eurprd08.prod.outlook.com
State New
Headers show
Series
  • [PR85720/partial] Support runtime loop versioning if loop can be distributed into builtin functions
Related show

Commit Message

Bin Cheng May 22, 2018, 4:38 p.m.
Hi,
This patch partially improves loop distribution for PR85720.  It now supports runtime
loop versioning if the loop can be distributed into builtin functions.  Note that for the
moment only a coarse-grained runtime alias check is performed; the different overlapping
cases for different dependence relations are not supported yet.
Note also that the changes in break_alias_scc_partitions and version_loop_by_alias_check
do not strictly match each other, with the latter being more restrictive, because it's
hard to pass information around.  Hopefully this will be resolved when classifying
distributor.

Bootstrapped and tested on x86_64.  Is it OK?

Thanks,
bin

2018-05-22  Bin Cheng  <bin.cheng@arm.com>

	* tree-loop-distribution.c (break_alias_scc_partitions): Don't merge
	SCC if all partitions are builtins.
	(version_loop_by_alias_check): New parameter.  Generate cancelable
	runtime alias check if all partitions are builtins.
	(distribute_loop): Update call to above function.

gcc/testsuite
2018-05-22  Bin Cheng  <bin.cheng@arm.com>

	* gcc.dg/tree-ssa/pr85720.c: New test.
	* gcc.target/i386/avx256-unaligned-store-2.c: Disable loop pattern
	distribution.

Comments

Richard Biener May 23, 2018, 11:22 a.m. | #1
On Tue, May 22, 2018 at 6:38 PM Bin Cheng <Bin.Cheng@arm.com> wrote:

> Hi,
> This patch partially improves loop distribution for PR85720.  It now supports runtime
> loop versioning if the loop can be distributed into builtin functions.  Note for this
> moment only coarse-grain runtime alias is checked, while different overlapping cases
> for different dependence relations are not supported yet.
> Note changes in break_alias_scc_partitions and version_loop_by_alias_check do not
> strictly match each other, with the latter more restricted.  Because it's hard to pass
> information around.  Hopefully this will be resolved when classifying distributor.
>
> Bootstrap and test on x86_64.  Is it OK?


OK.

Thanks,
Richard.

> Thanks,
> bin
>
> 2018-05-22  Bin Cheng  <bin.cheng@arm.com>
>
>          * tree-loop-distribution.c (break_alias_scc_partitions): Don't merge
>          SCC if all partitions are builtins.
>          (version_loop_by_alias_check): New parameter.  Generate cancelable
>          runtime alias check if all partitions are builtins.
>          (distribute_loop): Update call to above function.
>
> gcc/testsuite
> 2018-05-22  Bin Cheng  <bin.cheng@arm.com>
>
>          * gcc.dg/tree-ssa/pr85720.c: New test.
>          * gcc.target/i386/avx256-unaligned-store-2.c: Disable loop pattern
>          distribution.

Patch

From 2518709d31440525010fa6692b531419fc81b426 Mon Sep 17 00:00:00 2001
From: Bin Cheng <binche01@e108451-lin.cambridge.arm.com>
Date: Mon, 21 May 2018 15:49:55 +0100
Subject: [PATCH] pr85720-20180520

---
 gcc/testsuite/gcc.dg/tree-ssa/pr85720.c            | 13 +++++++
 .../gcc.target/i386/avx256-unaligned-store-2.c     |  2 +-
 gcc/tree-loop-distribution.c                       | 40 +++++++++++++++++-----
 3 files changed, 45 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr85720.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr85720.c b/gcc/testsuite/gcc.dg/tree-ssa/pr85720.c
new file mode 100644
index 0000000..18d8be9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr85720.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile { target size32plus } } */
+/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist" } */
+
+void fill(char* A, char* B, unsigned n)
+{
+    for (unsigned i = 0; i < n; i++)
+    {
+        A[i] = 0;
+        B[i] = A[i] + 1;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "_builtin_memset" 2 "ldist" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
index 87285c6..1e7969b 100644
--- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O3 -mtune-ctrl=sse_typeless_stores -dp -mavx -mavx256-split-unaligned-store -mno-prefer-avx128" } */
+/* { dg-options "-O3 -mtune-ctrl=sse_typeless_stores -dp -mavx -mavx256-split-unaligned-store -mno-prefer-avx128 -fno-tree-loop-distribute-patterns" } */
 
 #define N 1024
 
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 5e327f4..c6e0a60 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -2268,21 +2268,26 @@  break_alias_scc_partitions (struct graph *rdg,
 	  for (j = 0; partitions->iterate (j, &first); ++j)
 	    if (pg->vertices[j].component == i)
 	      break;
+
+	  bool same_type = true, all_builtins = partition_builtin_p (first);
 	  for (++j; partitions->iterate (j, &partition); ++j)
 	    {
 	      if (pg->vertices[j].component != i)
 		continue;
 
-	      /* Note we Merge partitions of parallel type on purpose, though
-		 the result partition is sequential.  The reason is vectorizer
-		 can do more accurate runtime alias check in this case.  Also
-		 it results in more conservative distribution.  */
 	      if (first->type != partition->type)
 		{
-		  bitmap_clear_bit (sccs_to_merge, i);
+		  same_type = false;
 		  break;
 		}
+	      all_builtins &= partition_builtin_p (partition);
 	    }
+	  /* Merge SCC if all partitions in SCC have the same type, though the
+	     result partition is sequential, because vectorizer can do better
+	     runtime alias check.  One exception is when all partitions in SCC
+	     are builtins.  */
+	  if (!same_type || all_builtins)
+	    bitmap_clear_bit (sccs_to_merge, i);
 	}
 
       /* Initialize callback data for traversing.  */
@@ -2458,7 +2463,8 @@  compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
    checks and version LOOP under condition of these runtime alias checks.  */
 
 static void
-version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
+version_loop_by_alias_check (vec<struct partition *> *partitions,
+			     struct loop *loop, vec<ddr_p> *alias_ddrs)
 {
   profile_probability prob;
   basic_block cond_bb;
@@ -2481,9 +2487,25 @@  version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
 				      is_gimple_val, NULL_TREE);
 
   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
-  if (flag_tree_loop_vectorize)
+  bool cancelable_p = flag_tree_loop_vectorize;
+  if (cancelable_p)
+    {
+      unsigned i = 0;
+      struct partition *partition;
+      for (; partitions->iterate (i, &partition); ++i)
+	if (!partition_builtin_p (partition))
+	  break;
+
+     /* If all partitions are builtins, distributing it would be profitable and
+	we don't want to cancel the runtime alias checks.  */
+      if (i == partitions->length ())
+	cancelable_p = false;
+    }
+
+  /* Generate internal function call for loop distribution alias check if the
+     runtime alias check should be cancelable.  */
+  if (cancelable_p)
     {
-      /* Generate internal function call for loop distribution alias check.  */
       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
 					      2, NULL_TREE, cond_expr);
       lhs = make_ssa_name (boolean_type_node);
@@ -2883,7 +2905,7 @@  distribute_loop (struct loop *loop, vec<gimple *> stmts,
     }
 
   if (version_for_distribution_p (&partitions, &alias_ddrs))
-    version_loop_by_alias_check (loop, &alias_ddrs);
+    version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
-- 
1.9.1