[og7,nvptx,openacc,PR85381,committed] Don't emit barriers for empty loops

Message ID d52f811f-4c6f-c4b6-418f-7d7c72d75461@mentor.com
State New
Headers show
Series
  • [og7,nvptx,openacc,PR85381,committed] Don't emit barriers for empty loops
Related show

Commit Message

Tom de Vries April 21, 2018, 9:59 a.m.
Hi,

when compiling this testcase with the og7 branch:
...
int
main (void)
{
   long long v1;
#pragma acc parallel num_gangs (640) num_workers(1) vector_length (128)
#pragma acc loop
   for (v1 = 0; v1 < 20; v1 += 2)
     ;

   return 0;
}
...

this ptx is generated:
...
{
   // fork 4; 

   bar.sync 0;
   // forked 4; 

   // joining 4; 

   bar.sync 0;
   // join 4; 

   ret;
}
...

This triggers some bug on my quadro m1200 (I'm assuming in the ptxas/JIT 
compiler) that hangs the testcase. I can work around this by adding a 
membar.cta before the bar.syc, or two membar.ctas inbetween, but I'm not 
really sure what a minimal workaround should look like (I reported the 
bug to nvidia, I'm hoping for them to answer that question).

This patch works around the bug by doing an optimization: we detect that 
this is an empty loop (a forked immediately followed by a joining), and 
don't emit the barriers.

Build x86_64 with nvptx accelerator and tested libgomp.

Committed to og7 branch.

Thanks,
- Tom

Patch

[nvptx, openacc] Don't emit barriers for empty loops

2018-04-21  Tom de Vries  <tom@codesourcery.com>

	PR target/85381
	* config/nvptx/nvptx.c (nvptx_process_pars): Don't emit barriers for
	empty loops.

	* testsuite/libgomp.oacc-c-c++-common/pr85381-2.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381-3.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381-4.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381-5.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381.c: New test.

---
 gcc/config/nvptx/nvptx.c                           | 15 +++++++---
 .../libgomp.oacc-c-c++-common/pr85381-2.c          | 35 ++++++++++++++++++++++
 .../libgomp.oacc-c-c++-common/pr85381-3.c          | 34 +++++++++++++++++++++
 .../libgomp.oacc-c-c++-common/pr85381-4.c          | 26 ++++++++++++++++
 .../libgomp.oacc-c-c++-common/pr85381-5.c          | 23 ++++++++++++++
 .../testsuite/libgomp.oacc-c-c++-common/pr85381.c  | 17 +++++++++++
 6 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 8c478c8..3aee9cc 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4467,9 +4467,12 @@  nvptx_process_pars (parallel *par)
     {
       nvptx_shared_propagate (false, is_call, par->forked_block,
 			      par->forked_insn, !worker);
-      bool empty = nvptx_shared_propagate (true, is_call,
-					   par->forked_block, par->fork_insn,
-					   !worker);
+      bool no_prop_p
+	= nvptx_shared_propagate (true, is_call, par->forked_block,
+				  par->fork_insn, !worker);
+      bool empty_loop_p
+	= !is_call && (NEXT_INSN (par->forked_insn)
+		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
       rtx barrier = GEN_INT (0);
       int threads = 0;
 
@@ -4479,7 +4482,11 @@  nvptx_process_pars (parallel *par)
 	  threads = nvptx_mach_vector_length ();
 	}
 
-      if (!empty || !is_call)
+      if (no_prop_p && empty_loop_p)
+	;
+      else if (no_prop_p && is_call)
+	;
+      else
 	{
 	  /* Insert begin and end synchronizations.  */
 	  emit_insn_before (nvptx_cta_sync (barrier, threads),
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-2.c
new file mode 100644
index 0000000..e5d02cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-2.c
@@ -0,0 +1,35 @@ 
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int
+main (void)
+{
+  int v1;
+
+  #pragma acc parallel
+  #pragma acc loop worker
+  for (v1 = 0; v1 < 20; v1 += 2)
+    ;
+
+  return 0;
+}
+
+/* Todo: Boths bar.syncs can be removed.
+   Atm we generate this dead code inbetween forked and joining:
+
+                     mov.u32 %r28, %ntid.y;
+                     mov.u32 %r29, %tid.y;
+                     add.u32 %r30, %r29, %r29;
+                     setp.gt.s32     %r31, %r30, 19;
+             @%r31   bra     $L2;
+                     add.u32 %r25, %r28, %r28;
+                     mov.u32 %r24, %r30;
+     $L3:
+                     add.u32 %r24, %r24, %r25;
+                     setp.le.s32     %r33, %r24, 19;
+             @%r33   bra     $L3;
+     $L2:
+
+   so the loop is not recognized as empty loop (which we detect by seeing if
+   joining immediately follows forked).  */
+/* { dg-final { scan-assembler-times "bar.sync" 2 } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-3.c
new file mode 100644
index 0000000..7d9ba1b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-3.c
@@ -0,0 +1,34 @@ 
+/* { dg-additional-options "-save-temps -w" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int a;
+#pragma acc declare create(a)
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+foo_v (void)
+{
+  a = 1;
+}
+
+#pragma acc routine worker
+void __attribute__((noinline, noclone))
+foo_w (void)
+{
+  a = 2;
+}
+
+int
+main (void)
+{
+
+  #pragma acc parallel
+  foo_v ();
+
+  #pragma acc parallel
+  foo_w ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "bar.sync" } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-4.c
new file mode 100644
index 0000000..477297d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-4.c
@@ -0,0 +1,26 @@ 
+/* { dg-additional-options "-save-temps -w" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+#define n 1024
+
+int
+main (void)
+{
+  #pragma acc parallel
+  {
+    #pragma acc loop worker
+    for (int i = 0; i < n; i++)
+      ;
+
+    #pragma acc loop worker
+    for (int i = 0; i < n; i++)
+      ;
+  }
+
+  return 0;
+}
+
+/* Atm, %ntid.y is broadcast from one loop to the next, so there are 2 bar.syncs
+   for that (the other two are there for the same reason as in pr85381-2.c).
+   Todo: Recompute %ntid.y instead of broadcasting it. */
+/* { dg-final { scan-assembler-times "bar.sync" 4 } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-5.c
new file mode 100644
index 0000000..4653009
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-5.c
@@ -0,0 +1,23 @@ 
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+#define n 1024
+
+int
+main (void)
+{
+  #pragma acc parallel vector_length(128)
+  {
+    #pragma acc loop vector
+    for (int i = 0; i < n; i++)
+      ;
+
+    #pragma acc loop vector
+    for (int i = 0; i < n; i++)
+      ;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "bar.sync" } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381.c
new file mode 100644
index 0000000..f585ae5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381.c
@@ -0,0 +1,17 @@ 
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int
+main (void)
+{
+  int v1;
+
+  #pragma acc parallel vector_length (128)
+  #pragma acc loop vector
+  for (v1 = 0; v1 < 20; v1 += 2)
+    ;
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "bar.sync" } } */