Enable firstprivate OpenACC reductions

Message ID f96a912e-ed90-b1cb-e864-416b779147c1@codesourcery.com
State New
Headers show
Series
  • Enable firstprivate OpenACC reductions
Related show

Commit Message

Cesar Philippidis July 20, 2018, 10:03 p.m.
At present, all reduction variables are transferred via an implicit
'copy' clause. As shown in the recent patches I've been posting, that
causes a lot of problems when the reduction variables are used by
multiple workers or vectors. This patch teaches the gimplifier to
transfer reduction variables as firstprivate in OpenACC parallel regions,
if they are in an inner loop. This matches the behavior of reductions in
OpenACC 2.6.

Is this patch OK for trunk? I bootstrapped and regtested on x86_64/nvptx.

Thanks,
Cesar

Patch

From 035be51a795ad8bed5342ba181220bf3102bcd6d Mon Sep 17 00:00:00 2001
From: Cesar Philippidis <cesar@codesourcery.com>
Date: Wed, 31 Jan 2018 07:21:53 -0800
Subject: [PATCH 4/5] Enable firstprivate OpenACC reductions

2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* gimplify.c (omp_add_variable): Allow certain OpenACC reduction
	variables to remain firstprivate.

	gcc/testsuite/
	* c-c++-common/goacc/reduction-8.c: New test.

(cherry picked from openacc-gcc-7-branch commit
441621739e2a067c97409f8b0e3e30362a7905be, cec00212ad8)
---
 gcc/gimplify.c                                | 30 ++++--
 .../c-c++-common/goacc/reduction-8.c          | 94 +++++++++++++++++++
 2 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/reduction-8.c

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 737a280cfe9..bcfb029275c 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -6858,9 +6858,16 @@  omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
   else
     splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags);
 
-  /* For reductions clauses in OpenACC loop directives, by default create a
-     copy clause on the enclosing parallel construct for carrying back the
-     results.  */
+  /* For OpenACC loop directives, when a reduction is immediately
+     enclosed within an acc parallel or kernels construct, it must
+     have an implied copy data mapping. E.g.
+
+       #pragma acc parallel
+	 {
+	   #pragma acc loop reduction (+:sum)
+
+     a copy clause for sum should be added on the enclosing parallel
+     construct for carrying back the results.  */
   if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
     {
       struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
@@ -6876,8 +6883,11 @@  omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 	    vector = true;
 	}
 
-      /* Set new copy map as 'private' if sure we're not gang-partitioning.  */
-      bool map_private;
+      /* Reduction data maps need to be marked as private for worker
+	 and vector loops, in order to ensure that the value of the
+	 reduction is carried back to the host.  Set new copy map as
+	 'private' if sure we're not gang-partitioning.  */
+      bool map_private, update_data_map = false;
 
       if (gang)
 	map_private = false;
@@ -6886,6 +6896,10 @@  omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
       else
 	map_private = oacc_privatize_reduction (ctx->outer_context);
 
+      if (ctx->outer_context
+	  && ctx->outer_context->region_type == ORT_ACC_PARALLEL)
+	update_data_map = true;
+
       while (outer_ctx)
 	{
 	  n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
@@ -6902,7 +6916,8 @@  omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 		  gcc_assert (!(n->value & GOVD_FIRSTPRIVATE)
 			      && (n->value & GOVD_MAP));
 		}
-	      else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
+	      else if (update_data_map
+		       && outer_ctx->region_type == ORT_ACC_PARALLEL)
 		{
 		  /* Remove firstprivate and make it a copy map.  */
 		  n->value &= ~GOVD_FIRSTPRIVATE;
@@ -6914,7 +6929,8 @@  omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
 		    n->value |= GOVD_MAP_PRIVATE;
 		}
 	    }
-	  else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
+	  else if (update_data_map
+		   && outer_ctx->region_type == ORT_ACC_PARALLEL)
 	    {
 	      unsigned f = GOVD_MAP | GOVD_SEEN;
 
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
new file mode 100644
index 00000000000..8a0283f4ac3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
@@ -0,0 +1,94 @@ 
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+#define n 1000
+
+int
+main(void)
+{
+  int i, j;
+  int result, array[n];
+
+#pragma acc parallel loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result ++;
+
+#pragma acc parallel
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result ++;
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop worker vector reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+#pragma acc parallel
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop gang reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+#pragma acc parallel copy(result)
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop gang reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+  
+#pragma acc kernels
+#pragma acc loop
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+  return 0;
+}
+
+/* Check that default copy maps are generated for loop reductions.  */
+/* { dg-final { scan-tree-dump-times "reduction..:result. map.tofrom:result .len: 4.." 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel map.tofrom:result .len: 4.." 2 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:array .len: 4000.. firstprivate.result." 3 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:result .len: 4.. map.tofrom:array .len: 4000.." 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:array .len: 4000.. map.force_tofrom:result .len: 4.." 1 "gimple" } } */
+
-- 
2.17.1