[3/7,OpenACC,libgomp,v5,stage1] Async re-work, libgomp/oacc-* changes

Message ID dd1b1448-bc89-35fc-af90-5c0b117d3da9@mentor.com
State New
Headers show
Series
  • Async re-work
Related show

Commit Message

Chung-Lin Tang Jan. 22, 2019, 2:52 p.m.
Hi Thomas,
in this patch, the issue on the synchronize/serialize hooks should be resolved;
all call sites should now handle the failing case.

As for the host _construct hook issue, I went your suggestion and used 0xffff...
as the dummy, the NULL pointer is still used for error here. I mentioned of
using NULL to mean synchronous earlier, but for the construct hook I think there's
no need to complicate it that way.

This patch still has the few lines to do the local host-side synchronization
for wait+async, IIRC we haven't reached a clear resolution to which is correct for
this. (It's fairly easy to change the behavior before we commit)

Thanks,
Chung-Lin

     * oacc-async.c (get_goacc_thread): New function.
     (get_goacc_thread_device): New function.
     (lookup_goacc_asyncqueue): New function.
     (get_goacc_asyncqueue): New function.
     (acc_async_test): Adjust code to use new async design.
     (acc_async_test_all): Likewise.
     (acc_wait): Likewise.
     (acc_wait_async): Likewise.
     (acc_wait_all): Likewise.
     (acc_wait_all_async): Likewise.
     (acc_get_default_async): New API function.
     (acc_set_default_async): Likewise.
     (goacc_async_unmap_tgt): New function.
     (goacc_async_copyout_unmap_vars): Likewise.
     (goacc_async_free): Likewise.
     (goacc_init_asyncqueues): Likewise.
     (goacc_fini_asyncqueues): Likewise.
     * oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
     design.
     (acc_set_cuda_stream): Likewise.

     * oacc-host.c (host_version): Update to return GOMP_PLUGIN_IF_VERSION.
     (host_openacc_exec): Adjust parameters, remove 'async'.
     (host_openacc_register_async_cleanup): Remove.
     (host_openacc_async_exec): New function.
     (host_openacc_async_test): Adjust parameters.
     (host_openacc_async_test_all): Remove.
     (host_openacc_async_wait): Remove.
     (host_openacc_async_wait_async): Remove.
     (host_openacc_async_wait_all): Remove.
     (host_openacc_async_wait_all_async): Remove.
     (host_openacc_async_set_async): Remove.
     (host_openacc_async_synchronize): New function.
     (host_openacc_async_serialize): New function.
     (host_openacc_async_host2dev): New function.
     (host_openacc_async_dev2host): New function.
     (host_openacc_async_queue_callback): New function.
     (host_openacc_async_construct): New function.
     (host_openacc_async_destruct): New function.
     (struct gomp_device_descr host_dispatch): Remove initialization of old
     interface, add intialization of new async sub-struct.

     * oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
     (goacc_attach_host_thread_to_device): Remove old async code usage, add
     initialization of per-thread default_async.

     * oacc-int.h (struct goacc_thread): Add default_async field.
     (goacc_init_asyncqueues): New declaration.
     (goacc_fini_asyncqueues): Likewise.
     (goacc_async_free): Likewise.
     (get_goacc_asyncqueue): Likewise.
     (lookup_goacc_asyncqueue): Likewise.

     * oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
     design.
     (present_create_copy): Likewise.
     (delete_copyout): Likewise.
     (update_dev_host): Likewise.
     (gomp_acc_insert_pointer): Add async parameter, adjust code to use new
     async design.
     (gomp_acc_remove_pointer): Adjust code to use new async design.

     * oacc-parallel.c (GOACC_parallel_keyed): Likewise.
     (GOACC_enter_exit_data): Likewise.
     (goacc_wait): Likewise.
     (GOACC_update): Likewise.

     * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail
     when called, warn as obsolete in comment.

Patch

Index: libgomp/oacc-async.c
===================================================================
--- libgomp/oacc-async.c	(revision 268142)
+++ libgomp/oacc-async.c	(working copy)
@@ -27,10 +27,99 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <assert.h>
+#include <string.h>
 #include "openacc.h"
 #include "libgomp.h"
 #include "oacc-int.h"
 
+static struct goacc_thread *
+get_goacc_thread (void)
+{
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  return thr;
+}
+
+static struct gomp_device_descr *
+get_goacc_thread_device (void)
+{
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  return thr->dev;
+}
+
+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+  /* The special value acc_async_noval (-1) maps to the thread-specific
+     default async stream.  */
+  if (async == acc_async_noval)
+    async = thr->default_async;
+
+  if (async == acc_async_sync)
+    return NULL;
+
+  if (async < 0)
+    gomp_fatal ("bad async %d", async);
+
+  struct goacc_asyncqueue *ret_aq = NULL;
+  struct gomp_device_descr *dev = thr->dev;
+
+  gomp_mutex_lock (&dev->openacc.async.lock);
+
+  if (!create
+      && (async >= dev->openacc.async.nasyncqueue
+	  || !dev->openacc.async.asyncqueue[async]))
+    goto end;
+
+  if (async >= dev->openacc.async.nasyncqueue)
+    {
+      int diff = async + 1 - dev->openacc.async.nasyncqueue;
+      dev->openacc.async.asyncqueue
+	= gomp_realloc (dev->openacc.async.asyncqueue,
+			sizeof (goacc_aq) * (async + 1));
+      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+	      0, sizeof (goacc_aq) * diff);
+      dev->openacc.async.nasyncqueue = async + 1;
+    }
+
+  if (!dev->openacc.async.asyncqueue[async])
+    {
+      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+
+      if (!dev->openacc.async.asyncqueue[async])
+	{
+	  gomp_mutex_unlock (&dev->openacc.async.lock);
+	  gomp_fatal ("async %d creation failed", async);
+	}
+      
+      /* Link new async queue into active list.  */
+      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+      n->aq = dev->openacc.async.asyncqueue[async];
+      n->next = dev->openacc.async.active;
+      dev->openacc.async.active = n;
+    }
+
+  ret_aq = dev->openacc.async.asyncqueue[async];
+
+ end:
+  gomp_mutex_unlock (&dev->openacc.async.lock);
+  return ret_aq;
+}
+
+attribute_hidden struct goacc_asyncqueue *
+get_goacc_asyncqueue (int async)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+  return lookup_goacc_asyncqueue (thr, true, async);
+}
+
 int
 acc_async_test (int async)
 {
@@ -42,18 +131,25 @@  acc_async_test (int async)
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
-  return thr->dev->openacc.async_test_func (async);
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
+  return thr->dev->openacc.async.test_func (aq);
 }
 
 int
 acc_async_test_all (void)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
-
-  return thr->dev->openacc.async_test_all_func ();
+  int ret = 1;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    if (!thr->dev->openacc.async.test_func (l->aq))
+      {
+	ret = 0;
+	break;
+      }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+  return ret;
 }
 
 void
@@ -62,12 +158,11 @@  acc_wait (int async)
   if (!async_valid_p (async))
     gomp_fatal ("invalid async argument: %d", async);
 
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
-
-  thr->dev->openacc.async_wait_func (async);
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
+  if (!thr->dev->openacc.async.synchronize_func (aq))
+    gomp_fatal ("wait on %d failed", async);
 }
 
 /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
@@ -84,23 +179,36 @@  acc_async_wait (int async)
 void
 acc_wait_async (int async1, int async2)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
+  goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
+  if (!aq1)
+    gomp_fatal ("invalid async 1");
 
-  thr->dev->openacc.async_wait_async_func (async1, async2);
+  /* An async queue is always synchronized with itself.  */
+  if (aq1 == aq2)
+    return;
+
+  if (!thr->dev->openacc.async.synchronize_func (aq1))
+    gomp_fatal ("wait on %d failed", async1);
+  if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
+    gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
 }
 
 void
 acc_wait_all (void)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *dev = get_goacc_thread_device ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  bool ret = true;
+  gomp_mutex_lock (&dev->openacc.async.lock);
+  for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
+    ret &= dev->openacc.async.synchronize_func (l->aq);
+  gomp_mutex_unlock (&dev->openacc.async.lock);
 
-  thr->dev->openacc.async_wait_all_func ();
+  if (!ret)
+    gomp_fatal ("wait all failed");
 }
 
 /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  */
@@ -120,10 +228,80 @@  acc_wait_all_async (int async)
   if (!async_valid_p (async))
     gomp_fatal ("invalid async argument: %d", async);
 
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
 
-  thr->dev->openacc.async_wait_all_async_func (async);
+  bool ret = true;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    {
+      ret &= thr->dev->openacc.async.synchronize_func (l->aq);
+      if (waiting_queue)
+	ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
+    }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (!ret)
+    gomp_fatal ("wait all async(%d) failed", async);
 }
+
+int
+acc_get_default_async (void)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+  return thr->default_async;
+}
+
+void
+acc_set_default_async (int async)
+{
+  if (async < acc_async_sync)
+    gomp_fatal ("invalid async argument: %d", async);
+
+  struct goacc_thread *thr = get_goacc_thread ();
+  thr->default_async = async;
+}
+
+attribute_hidden void
+goacc_async_free (struct gomp_device_descr *devicep,
+		  struct goacc_asyncqueue *aq, void *ptr)
+{
+  if (!aq)
+    free (ptr);
+  else
+    devicep->openacc.async.queue_callback_func (aq, free, ptr);
+}
+
+attribute_hidden void
+goacc_init_asyncqueues (struct gomp_device_descr *devicep)
+{
+  devicep->openacc.async.nasyncqueue = 0;
+  devicep->openacc.async.asyncqueue = NULL;
+  devicep->openacc.async.active = NULL;
+  gomp_mutex_init (&devicep->openacc.async.lock);
+}
+
+attribute_hidden bool
+goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
+{
+  bool ret = true;
+  gomp_mutex_lock (&devicep->openacc.async.lock);
+  if (devicep->openacc.async.nasyncqueue > 0)
+    {
+      goacc_aq_list next;
+      for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
+	{
+	  ret &= devicep->openacc.async.destruct_func (l->aq);
+	  next = l->next;
+	  free (l);
+	}
+      free (devicep->openacc.async.asyncqueue);
+      devicep->openacc.async.nasyncqueue = 0;
+      devicep->openacc.async.asyncqueue = NULL;
+      devicep->openacc.async.active = NULL;
+    }
+  gomp_mutex_unlock (&devicep->openacc.async.lock);
+  gomp_mutex_destroy (&devicep->openacc.async.lock);
+  return ret;
+}
Index: libgomp/oacc-cuda.c
===================================================================
--- libgomp/oacc-cuda.c	(revision 268142)
+++ libgomp/oacc-cuda.c	(working copy)
@@ -62,7 +62,11 @@  acc_get_cuda_stream (int async)
     return NULL;
 
   if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
-    return thr->dev->openacc.cuda.get_stream_func (async);
+    {
+      goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+      if (aq)
+	return thr->dev->openacc.cuda.get_stream_func (aq);
+    }
  
   return NULL;
 }
@@ -79,8 +83,14 @@  acc_set_cuda_stream (int async, void *stream)
 
   thr = goacc_thread ();
 
+  int ret = -1;
   if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
-    return thr->dev->openacc.cuda.set_stream_func (async, stream);
+    {
+      goacc_aq aq = get_goacc_asyncqueue (async);
+      gomp_mutex_lock (&thr->dev->openacc.async.lock);
+      ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
+      gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+    }
 
-  return -1;
+  return ret;
 }
Index: libgomp/oacc-host.c
===================================================================
--- libgomp/oacc-host.c	(revision 268142)
+++ libgomp/oacc-host.c	(working copy)
@@ -75,7 +75,7 @@  host_fini_device (int n __attribute__ ((unused)))
 static unsigned
 host_version (void)
 {
-  return GOMP_VERSION;
+  return GOMP_PLUGIN_IF_VERSION;
 }
 
 static int
@@ -140,8 +140,7 @@  host_openacc_exec (void (*fn) (void *),
 		   size_t mapnum __attribute__ ((unused)),
 		   void **hostaddrs,
 		   void **devaddrs __attribute__ ((unused)),
-		   int async __attribute__ ((unused)),
-		   unsigned *dims __attribute ((unused)),
+		   unsigned *dims __attribute__ ((unused)),
 		   void *targ_mem_desc __attribute__ ((unused)))
 {
   fn (hostaddrs);
@@ -148,49 +147,84 @@  host_openacc_exec (void (*fn) (void *),
 }
 
 static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
-				     int async __attribute__ ((unused)))
+host_openacc_async_exec (void (*fn) (void *),
+			 size_t mapnum __attribute__ ((unused)),
+			 void **hostaddrs,
+			 void **devaddrs __attribute__ ((unused)),
+			 unsigned *dims __attribute__ ((unused)),
+			 void *targ_mem_desc __attribute__ ((unused)),
+			 struct goacc_asyncqueue *aq __attribute__ ((unused)))
 {
+  fn (hostaddrs);
 }
 
 static int
-host_openacc_async_test (int async __attribute__ ((unused)))
+host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
 {
   return 1;
 }
 
-static int
-host_openacc_async_test_all (void)
+static bool
+host_openacc_async_synchronize (struct goacc_asyncqueue *aq
+				__attribute__ ((unused)))
 {
-  return 1;
+  return true;
 }
 
-static void
-host_openacc_async_wait (int async __attribute__ ((unused)))
+static bool
+host_openacc_async_serialize (struct goacc_asyncqueue *aq1
+			      __attribute__ ((unused)),
+			      struct goacc_asyncqueue *aq2
+			      __attribute__ ((unused)))
 {
+  return true;
 }
 
-static void
-host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
-			       int async2 __attribute__ ((unused)))
+static bool
+host_openacc_async_host2dev (int ord __attribute__ ((unused)),
+			     void *dst __attribute__ ((unused)),
+			     const void *src __attribute__ ((unused)),
+			     size_t n __attribute__ ((unused)),
+			     struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
 {
+  return true;
 }
 
-static void
-host_openacc_async_wait_all (void)
+static bool
+host_openacc_async_dev2host (int ord __attribute__ ((unused)),
+			     void *dst __attribute__ ((unused)),
+			     const void *src __attribute__ ((unused)),
+			     size_t n __attribute__ ((unused)),
+			     struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
 {
+  return true;
 }
 
 static void
-host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
+host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
+				   __attribute__ ((unused)),
+				   void (*callback_fn)(void *)
+				   __attribute__ ((unused)),
+				   void *userptr __attribute__ ((unused)))
 {
 }
 
-static void
-host_openacc_async_set_async (int async __attribute__ ((unused)))
+static struct goacc_asyncqueue *
+host_openacc_async_construct (void)
 {
+  /* Non-NULL 0xffff... value as opaque dummy.  */
+  return (struct goacc_asyncqueue *) -1;
 }
 
+static bool
+host_openacc_async_destruct (struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
+{
+  return true;
+}
+
 static void *
 host_openacc_create_thread_data (int ord __attribute__ ((unused)))
 {
@@ -235,19 +269,21 @@  static struct gomp_device_descr host_dispatch =
 
       .exec_func = host_openacc_exec,
 
-      .register_async_cleanup_func = host_openacc_register_async_cleanup,
-
-      .async_test_func = host_openacc_async_test,
-      .async_test_all_func = host_openacc_async_test_all,
-      .async_wait_func = host_openacc_async_wait,
-      .async_wait_async_func = host_openacc_async_wait_async,
-      .async_wait_all_func = host_openacc_async_wait_all,
-      .async_wait_all_async_func = host_openacc_async_wait_all_async,
-      .async_set_async_func = host_openacc_async_set_async,
-
       .create_thread_data_func = host_openacc_create_thread_data,
       .destroy_thread_data_func = host_openacc_destroy_thread_data,
 
+      .async = {
+	.construct_func = host_openacc_async_construct,
+	.destruct_func = host_openacc_async_destruct,
+	.test_func = host_openacc_async_test,
+	.synchronize_func = host_openacc_async_synchronize,
+	.serialize_func = host_openacc_async_serialize,
+	.queue_callback_func = host_openacc_async_queue_callback,
+	.exec_func = host_openacc_async_exec,
+	.dev2host_func = host_openacc_async_dev2host,
+	.host2dev_func = host_openacc_async_host2dev,
+      },
+
       .cuda = {
 	.get_current_device_func = NULL,
 	.get_current_context_func = NULL,
Index: libgomp/oacc-init.c
===================================================================
--- libgomp/oacc-init.c	(revision 268142)
+++ libgomp/oacc-init.c	(working copy)
@@ -309,7 +309,7 @@  acc_shutdown_1 (acc_device_t d)
       if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
         {
 	  devices_active = true;
-	  ret &= acc_dev->fini_device_func (acc_dev->target_id);
+	  ret &= gomp_fini_device (acc_dev);
 	  acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
 	}
       gomp_mutex_unlock (&acc_dev->lock);
@@ -426,8 +426,8 @@  goacc_attach_host_thread_to_device (int ord)
   
   thr->target_tls
     = acc_dev->openacc.create_thread_data_func (ord);
-  
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
+
+  thr->default_async = acc_async_default;
 }
 
 /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
Index: libgomp/oacc-int.h
===================================================================
--- libgomp/oacc-int.h	(revision 268142)
+++ libgomp/oacc-int.h	(working copy)
@@ -73,6 +73,9 @@  struct goacc_thread
 
   /* Target-specific data (used by plugin).  */
   void *target_tls;
+
+  /* Default OpenACC async queue for current thread, exported to plugin.  */
+  int default_async;
 };
 
 #if defined HAVE_TLS || defined USE_EMUTLS
@@ -99,6 +102,13 @@  void goacc_restore_bind (void);
 void goacc_lazy_initialize (void);
 void goacc_host_init (void);
 
+void goacc_init_asyncqueues (struct gomp_device_descr *);
+bool goacc_fini_asyncqueues (struct gomp_device_descr *);
+void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *,
+		       void *);
+struct goacc_asyncqueue *get_goacc_asyncqueue (int);
+struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool,
+						  int);
 static inline bool
 async_valid_stream_id_p (int async)
 {
Index: libgomp/oacc-mem.c
===================================================================
--- libgomp/oacc-mem.c	(revision 268142)
+++ libgomp/oacc-mem.c	(working copy)
@@ -172,18 +172,11 @@  memcpy_tofrom_device (bool from, void *d, void *h,
       return;
     }
 
-  if (async > acc_async_sync)
-    thr->dev->openacc.async_set_async_func (async);
-
-  bool ret = (from
-	      ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
-	      : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
-
-  if (async > acc_async_sync)
-    thr->dev->openacc.async_set_async_func (acc_async_sync);
-
-  if (!ret)
-    gomp_fatal ("error in %s", libfnname);
+  goacc_aq aq = get_goacc_asyncqueue (async);
+  if (from)
+    gomp_copy_dev2host (thr->dev, aq, h, d, s);
+  else
+    gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
 }
 
 void
@@ -509,17 +502,13 @@  present_create_copy (unsigned f, void *h, size_t s
 
       gomp_mutex_unlock (&acc_dev->lock);
 
-      if (async > acc_async_sync)
-	acc_dev->openacc.async_set_async_func (async);
+      goacc_aq aq = get_goacc_asyncqueue (async);
 
-      tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
-			   GOMP_MAP_VARS_OPENACC);
+      tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
+				 &kinds, true, GOMP_MAP_VARS_OPENACC);
       /* Initialize dynamic refcount.  */
       tgt->list[0].key->dynamic_refcount = 1;
 
-      if (async > acc_async_sync)
-	acc_dev->openacc.async_set_async_func (acc_async_sync);
-
       gomp_mutex_lock (&acc_dev->lock);
 
       d = tgt->to_free;
@@ -676,13 +665,9 @@  delete_copyout (unsigned f, void *h, size_t s, int
 
       if (f & FLAG_COPYOUT)
 	{
-	  if (async > acc_async_sync)
-	    acc_dev->openacc.async_set_async_func (async);
-	  acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
-	  if (async > acc_async_sync)
-	    acc_dev->openacc.async_set_async_func (acc_async_sync);
+	  goacc_aq aq = get_goacc_asyncqueue (async);
+	  gomp_copy_dev2host (acc_dev, aq, h, d, s);
 	}
-
       gomp_remove_var (acc_dev, n);
     }
 
@@ -765,17 +750,13 @@  update_dev_host (int is_dev, void *h, size_t s, in
   d = (void *) (n->tgt->tgt_start + n->tgt_offset
 		+ (uintptr_t) h - n->host_start);
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (async);
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
   if (is_dev)
-    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
+    gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
   else
-    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+    gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (acc_async_sync);
-
   gomp_mutex_unlock (&acc_dev->lock);
 }
 
@@ -805,7 +786,7 @@  acc_update_self_async (void *h, size_t s, int asyn
 
 void
 gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
-			 void *kinds)
+			 void *kinds, int async)
 {
   struct target_mem_desc *tgt;
   struct goacc_thread *thr = goacc_thread ();
@@ -835,8 +816,9 @@  gomp_acc_insert_pointer (size_t mapnum, void **hos
     }
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
-		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
+  goacc_aq aq = get_goacc_asyncqueue (async);
+  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
+			     NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
 
   /* Initialize dynamic refcount.  */
@@ -930,7 +912,10 @@  gomp_acc_remove_pointer (void *h, size_t s, bool f
       if (async < acc_async_noval)
 	gomp_unmap_vars (t, true);
       else
-	t->device_descr->openacc.register_async_cleanup_func (t, async);
+	{
+	  goacc_aq aq = get_goacc_asyncqueue (async);
+	  gomp_unmap_vars_async (t, true, aq);
+	}
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c	(revision 268142)
+++ libgomp/oacc-parallel.c	(working copy)
@@ -219,8 +219,6 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (voi
     }
   va_end (ap);
   
-  acc_dev->openacc.async_set_async_func (async);
-
   if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
     {
       k.host_start = (uintptr_t) fn;
@@ -237,44 +235,29 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (voi
   else
     tgt_fn = (void (*)) fn;
 
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       GOMP_MAP_VARS_OPENACC);
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
+  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+			     true, GOMP_MAP_VARS_OPENACC);
+  
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
 			    + tgt->list[i].key->tgt_offset
 			    + tgt->list[i].offset);
-
-  acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
-			      async, dims, tgt);
-
-  /* If running synchronously, unmap immediately.  */
-  bool copyfrom = true;
-  if (async_synchronous_p (async))
-    gomp_unmap_vars (tgt, true);
+  if (aq == NULL)
+    {
+      acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+				  dims, tgt);
+      /* If running synchronously, unmap immediately.  */
+      gomp_unmap_vars (tgt, true);
+    }
   else
     {
-      bool async_unmap = false;
-      for (size_t i = 0; i < tgt->list_count; i++)
-	{
-	  splay_tree_key k = tgt->list[i].key;
-	  if (k && k->refcount == 1)
-	    {
-	      async_unmap = true;
-	      break;
-	    }
-	}
-      if (async_unmap)
-	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
-      else
-	{
-	  copyfrom = false;
-	  gomp_unmap_vars (tgt, copyfrom);
-	}
+      acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+					dims, tgt, aq);
+      gomp_unmap_vars_async (tgt, true, aq);
     }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 /* Legacy entry point, only provide host execution.  */
@@ -385,8 +368,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	finalize = true;
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   /* Determine if this is an "acc enter data".  */
   for (i = 0; i < mapnum; ++i)
     {
@@ -439,11 +420,11 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		{
 		case GOMP_MAP_ALLOC:
 		case GOMP_MAP_FORCE_ALLOC:
-		  acc_create (hostaddrs[i], sizes[i]);
+		  acc_create_async (hostaddrs[i], sizes[i], async);
 		  break;
 		case GOMP_MAP_TO:
 		case GOMP_MAP_FORCE_TO:
-		  acc_copyin (hostaddrs[i], sizes[i]);
+		  acc_copyin_async (hostaddrs[i], sizes[i], async);
 		  break;
 		default:
 		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -454,7 +435,7 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  else
 	    {
 	      gomp_acc_insert_pointer (pointer, &hostaddrs[i],
-				       &sizes[i], &kinds[i]);
+				       &sizes[i], &kinds[i], async);
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -479,17 +460,17 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		if (acc_is_present (hostaddrs[i], sizes[i]))
 		  {
 		    if (finalize)
-		      acc_delete_finalize (hostaddrs[i], sizes[i]);
+		      acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
 		    else
-		      acc_delete (hostaddrs[i], sizes[i]);
+		      acc_delete_async (hostaddrs[i], sizes[i], async);
 		  }
 		break;
 	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_FORCE_FROM:
 		if (finalize)
-		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
+		  acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
 		else
-		  acc_copyout (hostaddrs[i], sizes[i]);
+		  acc_copyout_async (hostaddrs[i], sizes[i], async);
 		break;
 	      default:
 		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -507,8 +488,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	    i += pointer - 1;
 	  }
       }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 static void
@@ -520,18 +499,28 @@  goacc_wait (int async, int num_waits, va_list *ap)
   while (num_waits--)
     {
       int qid = va_arg (*ap, int);
-      
-      if (acc_async_test (qid))
+
+      goacc_aq aq = get_goacc_asyncqueue (qid);
+      if (acc_dev->openacc.async.test_func (aq))
 	continue;
-
       if (async == acc_async_sync)
-	acc_wait (qid);
+	{
+	  if (!acc_dev->openacc.async.synchronize_func (aq))
+	    gomp_fatal ("wait(%d) failed", qid);
+	}
       else if (qid == async)
-	;/* If we're waiting on the same asynchronous queue as we're
-	    launching on, the queue itself will order work as
-	    required, so there's no need to wait explicitly.  */
+	/* If we're waiting on the same asynchronous queue as we're
+	   launching on, the queue itself will order work as
+	   required, so there's no need to wait explicitly.  */
+	;
       else
-	acc_dev->openacc.async_wait_async_func (qid, async);
+	{
+	  goacc_aq aq2 = get_goacc_asyncqueue (async);
+	  if (!acc_dev->openacc.async.synchronize_func (aq))
+	    gomp_fatal ("wait(%d) failed", qid);
+	  if (!acc_dev->openacc.async.serialize_func (aq, aq2))
+	    gomp_fatal ("wait(%d) async(%d) failed", qid, async);
+	}
     }
 }
 
@@ -562,8 +551,6 @@  GOACC_update (int flags_m, size_t mapnum,
       va_end (ap);
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   bool update_device = false;
   for (i = 0; i < mapnum; ++i)
     {
@@ -603,7 +590,7 @@  GOACC_update (int flags_m, size_t mapnum,
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_TO:
 	  update_device = true;
-	  acc_update_device (hostaddrs[i], sizes[i]);
+	  acc_update_device_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	case GOMP_MAP_FROM:
@@ -615,7 +602,7 @@  GOACC_update (int flags_m, size_t mapnum,
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_FROM:
 	  update_device = false;
-	  acc_update_self (hostaddrs[i], sizes[i]);
+	  acc_update_self_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	default:
@@ -623,8 +610,6 @@  GOACC_update (int flags_m, size_t mapnum,
 	  break;
 	}
     }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 void
Index: libgomp/oacc-plugin.c
===================================================================
--- libgomp/oacc-plugin.c	(revision 268142)
+++ libgomp/oacc-plugin.c	(working copy)
@@ -30,15 +30,12 @@ 
 #include "oacc-plugin.h"
 #include "oacc-int.h"
 
+/* This plugin function is now obsolete.  */
 void
-GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
+GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
+			      int async __attribute__((unused)))
 {
-  struct target_mem_desc *tgt = ptr;
-  struct gomp_device_descr *devicep = tgt->device_descr;
-
-  devicep->openacc.async_set_async_func (async);
-  gomp_unmap_vars (tgt, true);
-  devicep->openacc.async_set_async_func (acc_async_sync);
+  gomp_fatal ("invalid plugin function");
 }
 
 /* Return the target-specific part of the TLS data for the current thread.  */