[07/10,OpenACC] Launch kernels asynchronously in OpenACC kernels regions

Message ID 7de5cbfe-3078-99e2-8018-a2adfd69830a@codesourcery.com
State New
Headers show
Series
  • Rework handling of OpenACC kernels regions
Related show

Commit Message

Kwok Cheung Yeung July 17, 2019, 9:13 p.m.
Kernels regions are decomposed into one or more smaller regions that are to be 
executed in sequence. With this patch, all of these regions are launched 
asynchronously, and a wait directive is added after them. This means that the 
host only waits once for the kernels to complete, not once per kernel. If the 
original kernels region was marked async, that asynchronous behavior is 
preserved, and no wait is added.

2019-07-16  Gergö Barany  <gergo@codesourcery.com>

	gcc/
	* omp-oacc-kernels.c (add_async_clauses_and_wait): New function...
	(decompose_kernels_region_body): ... called from here.

	gcc/testsuite/
	* c-c++-common/goacc/kernels-conversion.c: Test automatically generated
	async clauses.
	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
---
  gcc/omp-oacc-kernels.c                             | 56 ++++++++++++++++++++--
  .../c-c++-common/goacc/kernels-conversion.c        |  5 ++
  .../gfortran.dg/goacc/kernels-conversion.f95       |  5 ++
  3 files changed, 63 insertions(+), 3 deletions(-)

"convert_oacc_kernels" } }
+
  ! Check that the original kernels region is removed.
  ! { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } }
-- 
2.8.1

Patch

diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index 11a960c..0fae74a 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -66,7 +66,13 @@  along with GCC; see the file COPYING3.  If not see
       gang-parallelizable loop inside an if statement is "gang-serialized" by
       the transformation.
       The transformation visits loops inside such new gang-single-regions and
-     removes and warns about any gang annotations.  */
+     removes and warns about any gang annotations.
+   - In order to make the host wait only once for the whole region instead
+     of once per kernel launch, the new parallel and serial regions are
+     annotated async.  Unless the original kernels region was marked async,
+     the entire region ends with a wait construct.  If the original kernels
+     region was marked async, the generated async statements use the async
+     queue the kernels region was annotated with (possibly implicitly).  */

  /* Helper function for decompose_kernels_region_body.  If STMT contains a
     "top-level" OMP_FOR statement, returns a pointer to that statement;
@@ -676,6 +682,38 @@  maybe_build_inner_data_region (location_t loc, gimple *body,
    return body;
  }

+/* Helper function of decompose_kernels_region_body.  The statements in
+   REGION_BODY are expected to be decomposed parallel regions; add an
+   "async" clause to each.  Also add a "wait" pragma at the end of the
+   sequence.  */
+
+static void
+add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
+{
+  tree default_async_queue
+    = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+  for (gimple_stmt_iterator gsi = gsi_start (*region_body);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      tree target_clauses = gimple_omp_target_clauses (stmt);
+      tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
+      OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
+      OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
+      target_clauses = new_async_clause;
+      gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
+                                     target_clauses);
+    }
+  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
+  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+  gimple *wait_call = gimple_build_call (wait_fn, 2,
+                                         sync_arg, integer_zero_node);
+  gimple_set_location (wait_call, loc);
+  gimple_seq_add_stmt (region_body, wait_call);
+}
+
  /* Auxiliary analysis of the body of a kernels region, to determine for each
     OpenACC loop whether it is control-dependent (i.e., not necessarily
     executed every time the kernels region is entered) or not.
@@ -890,10 +928,12 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)
       except that the num_gangs, num_workers, and vector_length clauses will
       only be added to loop regions.  The other regions are "gang-single" and
       get an explicit num_gangs(1) clause.  So separate out the num_gangs,
-     num_workers, and vector_length clauses here.  */
+     num_workers, and vector_length clauses here.
+     Also check for the presence of an async clause but do not remove it
+     from the kernels clauses.  */
    tree num_gangs_clause = NULL, num_workers_clause = NULL,
         vector_length_clause = NULL;
-  tree prev_clause = NULL, next_clause = NULL;
+  tree prev_clause = NULL, next_clause = NULL, async_clause = NULL;
    tree parallel_clauses = kernels_clauses;
    for (tree c = parallel_clauses; c; c = next_clause)
      {
@@ -927,6 +967,8 @@  decompose_kernels_region_body (gimple *kernels_region, tree 
kernels_clauses)
          }
        else
          prev_clause = c;
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+        async_clause = c;
      }

    gimple *kernels_body = gimple_omp_body (kernels_region);
@@ -1090,6 +1132,14 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)
        gimple_seq_add_stmt (&region_body, single_region);
      }

+  /* We want to launch these kernels asynchronously.  If the original
+     kernels region had an async clause, this is done automatically because
+     that async clause was copied to the individual regions we created.
+     Otherwise, add an async clause to each newly created region, as well as
+     a wait directive at the end.  */
+  if (async_clause == NULL)
+    add_async_clauses_and_wait (loc, &region_body);
+
    tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
    gimple *body = gimple_build_bind (kernels_locals, region_body,
                                      make_node (BLOCK));
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c 
b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
index ed4d642..3e52ec4 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -49,5 +49,10 @@  main (void)
  /* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 
"convert_oacc_kernels" } } */
  /* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 
"convert_oacc_kernels" } } */

+/* Each of the parallel regions is async, and there is a final call to
+   __builtin_GOACC_wait.  */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5 
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 
"convert_oacc_kernels" } } */
+
  /* Check that the original kernels region is removed.  */
  /* { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 
b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
index f89e46b..559916c 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -47,5 +47,10 @@  end program main
  ! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 
"convert_oacc_kernels" } }
  ! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 
"convert_oacc_kernels" } }

+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5 
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1