amdgcn, nvptx: Disable OMP barriers in nested teams

Message ID 7024bb16-6b91-f5ff-f72e-92c298c73f3b@codesourcery.com
State New
Headers show
Series
  • amdgcn, nvptx: Disable OMP barriers in nested teams
Related show

Commit Message

Andrew Stubbs Sept. 18, 2020, 11:25 a.m.
This patch fixes a problem in which nested OpenMP parallel regions cause 
errors if the number of inner teams is not balanced (i.e. the number of 
loop iterations is not divisible by the number of physical threads). A 
testcase is included.

On NVPTX the symptom was a fatal error:

libgomp: cuCtxSynchronize error: an illegal instruction was encountered

This was caused by mismatched "bar.sync" instructions (one waiting for 
32 threads while another is waiting for 256). The source of the mismatch 
being that some threads were still busy while others had run out of work 
to do.

On GCN there was no such error (GCN barriers always wait for all 
threads), but it worked only by chance: the idle threads were "matching" 
different barriers to the busy threads, but it was harmless because the 
thread function pointer remained NULL.

This patch simply skips barriers when they would "wait" for only one 
thread (the current thread). This means that teams nested inside other 
teams now run independently, instead of strictly in lock-step, and is 
only valid as long as inner teams are limited to one thread each 
(currently the case). When the inner regions exit then the barriers for 
the outer region will sync everything up again.

OK to commit?

Andrew

P.S. I can approve the amdgcn portion myself; I'm seeking approval for 
the nvptx portion.

Comments

Andrew Stubbs Sept. 19, 2020, 6:30 p.m. | #1
On 18/09/2020 12:25, Andrew Stubbs wrote:
> This patch fixes a problem in which nested OpenMP parallel regions cause 

> errors if the number of inner teams is not balanced (i.e. the number of 

> loop iterations is not divisible by the number of physical threads). A 

> testcase is included.


This updated version removes an editing mistake that should have been 
spotted sooner.

Sorry for the inconvenience.

Andrew
libgomp: disable barriers in nested teams

Both GCN and NVPTX allow nested parallel regions, but the barrier
implementation did not allow the nested teams to run independently of each
other (due to hardware limitations).  This patch fixes that, under the
assumption that each thread will create a new subteam of one thread, by
simply not using barriers when there's no other thread to synchronise.

libgomp/ChangeLog:

	* config/gcn/bar.c (gomp_barrier_wait_end): Skip the barrier if the
	total number of threads is one.
	(gomp_team_barrier_wake): Likewise.
	(gomp_team_barrier_wait_end): Likewise.
	(gomp_team_barrier_wait_cancel_end): Likewise.
	* config/nvptx/bar.c (gomp_barrier_wait_end): Likewise.
	(gomp_team_barrier_wake): Likewise.
	(gomp_team_barrier_wait_end): Likewise.
	(gomp_team_barrier_wait_cancel_end): Likewise.
	* testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c: New test.

diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c
index 02fd19710d4..a21529a624b 100644
--- a/libgomp/config/gcn/bar.c
+++ b/libgomp/config/gcn/bar.c
@@ -43,7 +43,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELAXED);
     }
-  asm ("s_barrier" ::: "memory");
+  if (bar->total > 1)
+    asm ("s_barrier" ::: "memory");
 }
 
 void
@@ -71,7 +72,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar)
 void
 gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 {
-  asm ("s_barrier" ::: "memory");
+  if (bar->total > 1)
+    asm ("s_barrier" ::: "memory");
 }
 
 void
@@ -97,7 +99,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 	  state &= ~BAR_CANCELLED;
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
-	  asm ("s_barrier" ::: "memory");
+	  if (bar->total > 1)
+	    asm ("s_barrier" ::: "memory");
 	  return;
 	}
     }
@@ -172,7 +175,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	{
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
-	  asm ("s_barrier" ::: "memory");
+	  if (bar->total > 1)
+	    asm ("s_barrier" ::: "memory");
 	  return false;
 	}
     }
@@ -195,7 +199,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	  abort();
 	}
 
-      asm ("s_barrier" ::: "memory");
+      if (bar->total > 1)
+	asm ("s_barrier" ::: "memory");
       gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
 	return true;
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index 125ca3e49ec..1116561d931 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -41,7 +41,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELEASE);
     }
-  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 }
 
 void
@@ -69,7 +70,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar)
 void
 gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 {
-  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 }
 
 void
@@ -95,7 +97,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 	  state &= ~BAR_CANCELLED;
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+	  if (bar->total > 1)
+	    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 	  return;
 	}
     }
@@ -104,7 +107,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
   state &= ~BAR_CANCELLED;
   do
     {
-      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (bar->total > 1)
+	asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
 	{
@@ -158,7 +162,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	{
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+	  if (bar->total > 1)
+	    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 	  return false;
 	}
     }
@@ -169,7 +174,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
   generation = state;
   do
     {
-      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (bar->total > 1)
+	asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
 	return true;
diff --git a/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
new file mode 100644
index 00000000000..e777271dde1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
@@ -0,0 +1,31 @@
+/* Ensure that nested parallel regions work even when the number of loop
+   iterations is not divisible by the number of threads.  */
+
+#include <stdlib.h>
+
+int main() {
+  int A[30][40], B[30][40];
+  size_t n = 30;
+
+  for (size_t i = 0; i < 30; ++i)
+    for (size_t j = 0; j < 40; ++j)
+    A[i][j] = 42;
+
+#pragma omp target map(A[0:30][0:40], B[0:30][0:40])
+  {
+#pragma omp parallel for num_threads(8)
+    for (size_t i = 0; i < n; ++i)
+      {
+#pragma omp parallel for
+	for (size_t j = 0; j < n; ++j)
+	  {
+	    B[i][j] = A[i][j];
+	  }
+      }
+  }
+
+for (size_t i = 0; i < n; ++i)
+  for (size_t j = 0; j < n; ++j)
+    if (B[i][j] != 42)
+      abort ();
+}
Tom de Vries Sept. 28, 2020, 2:02 p.m. | #2
On 9/18/20 1:25 PM, Andrew Stubbs wrote:
> This patch fixes a problem in which nested OpenMP parallel regions cause

> errors if the number of inner teams is not balanced (i.e. the number of

> loop iterations is not divisible by the number of physical threads). A

> testcase is included.

> 

> On NVPTX the symptom was a fatal error:

> 

> libgomp: cuCtxSynchronize error: an illegal instruction was encountered

> 

> This was caused by mismatched "bar.sync" instructions (one waiting for

> 32 threads while another is waiting for 256). The source of the mismatch

> being that some threads were still busy while others had run out of work

> to do.

> 

> On GCN there was no such error (GCN barriers always wait for all

> threads), but it worked only by chance: the idle threads were "matching"

> different barriers to the busy threads, but it was harmless because the

> thread function pointer remained NULL.

> 

> This patch simply skips barriers when they would "wait" for only one

> thread (the current thread). This means that teams nested inside other

> teams now run independently, instead of strictly in lock-step, and is

> only valid as long as inner teams are limited to one thread each

> (currently the case).


Is this inner-team-one-thread-limit coded or documented somewhere?

If so, it might be good to add a comment there referring to the code
this patch adds.

Follow-up patch is OK, thanks.
- Tom

> When the inner regions exit then the barriers for

> the outer region will sync everything up again.

> 

> OK to commit?

> 

> Andrew

> 

> P.S. I can approve the amdgcn portion myself; I'm seeking approval for

> the nvptx portion.
Andrew Stubbs Sept. 28, 2020, 2:17 p.m. | #3
On 28/09/2020 15:02, Tom de Vries wrote:
>> This patch simply skips barriers when they would "wait" for only one

>> thread (the current thread). This means that teams nested inside other

>> teams now run independently, instead of strictly in lock-step, and is

>> only valid as long as inner teams are limited to one thread each

>> (currently the case).

> 

> Is this inner-team-one-thread-limit coded or documented somewhere?


In libgomp/parallel.c, gomp_resolve_num_threads we have:

   else if (thr->ts.active_level >= 1 && !icv->nest_var)
     return 1;

> If so, it might be good to add a comment there referring to the code

> this patch adds.


   /* Accelerators with fixed thread counts require this to return 1 for
      nested parallel regions.  */

WDYT?

Andrew
Tom de Vries Sept. 28, 2020, 2:21 p.m. | #4
On 9/28/20 4:17 PM, Andrew Stubbs wrote:
> On 28/09/2020 15:02, Tom de Vries wrote:

>>> This patch simply skips barriers when they would "wait" for only one

>>> thread (the current thread). This means that teams nested inside other

>>> teams now run independently, instead of strictly in lock-step, and is

>>> only valid as long as inner teams are limited to one thread each

>>> (currently the case).

>>

>> Is this inner-team-one-thread-limit coded or documented somewhere?

> 

> In libgomp/parallel.c, gomp_resolve_num_threads we have:

> 

>   else if (thr->ts.active_level >= 1 && !icv->nest_var)

>     return 1;

> 

>> If so, it might be good to add a comment there referring to the code

>> this patch adds.

> 

>   /* Accelerators with fixed thread counts require this to return 1 for

>      nested parallel regions.  */

> 

> WDYT?


Yep, looks good, thanks.
- Tom

Patch

libgomp: disable barriers in nested teams

Both GCN and NVPTX allow nested parallel regions, but the barrier
implementation did not allow the nested teams to run independently of each
other (due to hardware limitations).  This patch fixes that, under the
assumption that each thread will create a new subteam of one thread, by
simply not using barriers when there's no other thread to synchronise.

libgomp/ChangeLog:

	* config/gcn/bar.c (gomp_barrier_wait_end): Skip the barrier if the
	total number of threads is one.
	(gomp_team_barrier_wake): Likewise.
	(gomp_team_barrier_wait_end): Likewise.
	(gomp_team_barrier_wait_cancel_end): Likewise.
	* config/nvptx/bar.c (gomp_barrier_wait_end): Likewise.
	(gomp_team_barrier_wake): Likewise.
	(gomp_team_barrier_wait_end): Likewise.
	(gomp_team_barrier_wait_cancel_end): Likewise.
	* testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c: New test.

diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c
index 02fd19710d4..a21529a624b 100644
--- a/libgomp/config/gcn/bar.c
+++ b/libgomp/config/gcn/bar.c
@@ -43,7 +43,8 @@  gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELAXED);
     }
-  asm ("s_barrier" ::: "memory");
+  if (bar->total > 1)
+    asm ("s_barrier" ::: "memory");
 }
 
 void
@@ -71,7 +72,8 @@  gomp_barrier_wait_last (gomp_barrier_t *bar)
 void
 gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 {
-  asm ("s_barrier" ::: "memory");
+  if (bar->total > 1)
+    asm ("s_barrier" ::: "memory");
 }
 
 void
@@ -97,7 +99,8 @@  gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 	  state &= ~BAR_CANCELLED;
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
-	  asm ("s_barrier" ::: "memory");
+	  if (bar->total > 1)
+	    asm ("s_barrier" ::: "memory");
 	  return;
 	}
     }
@@ -172,7 +175,8 @@  gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	{
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
-	  asm ("s_barrier" ::: "memory");
+	  if (bar->total > 1)
+	    asm ("s_barrier" ::: "memory");
 	  return false;
 	}
     }
@@ -195,7 +199,8 @@  gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	  abort();
 	}
 
-      asm ("s_barrier" ::: "memory");
+      if (bar->total > 1)
+	asm ("s_barrier" ::: "memory");
       gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
 	return true;
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index 125ca3e49ec..1116561d931 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -41,7 +41,8 @@  gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELEASE);
     }
-  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 }
 
 void
@@ -69,7 +70,8 @@  gomp_barrier_wait_last (gomp_barrier_t *bar)
 void
 gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 {
-  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 }
 
 void
@@ -95,7 +97,8 @@  gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 	  state &= ~BAR_CANCELLED;
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+	  if (bar->total > 1)
+	    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 	  return;
 	}
     }
@@ -104,7 +107,8 @@  gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
   state &= ~BAR_CANCELLED;
   do
     {
-      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (bar->total > 1)
+	asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
 	{
@@ -158,7 +162,8 @@  gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	{
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+	  if (bar->total > 1)
+	    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 	  return false;
 	}
     }
@@ -169,7 +174,8 @@  gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
   generation = state;
   do
     {
-      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (bar->total > 1)
+	asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
 	return true;
diff --git a/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
new file mode 100644
index 00000000000..e777271dde1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
@@ -0,0 +1,31 @@ 
+/* Ensure that nested parallel regions work even when the number of loop
+   iterations is not divisible by the number of threads.  */
+
+#include <stdlib.h>
+
+int main() {
+  int A[30][40], B[30][40];
+  size_t n = 30;
+
+  for (size_t i = 0; i < 30; ++i)
+    for (size_t j = 0; j < 40; ++j)
+    A[i][j] = 42;
+
+#pragma omp target map(A[0:30][0:40], B[0:30][0:40])
+  {
+#pragma omp parallel for num_threads(8)
+    for (size_t i = 0; i < n; ++i)
+      {
+#pragma omp parallel for
+	for (size_t j = 0; j < n; ++j)
+	  {
+	    B[i][j] = A[i][j];
+	  }
+      }
+  }
+
+for (size_t i = 0; i < n; ++i)
+  for (size_t j = 0; j < n; ++j)
+    if (B[i][j] != 42)
+      abort ();
+}