[og9] Fix OpenACC "ephemeral" asynchronous host-to-device copies

Message ID 20190917172156.111727-5-julian@codesourcery.com
State New
Headers show
Series
  • [og9] Fix OpenACC "ephemeral" asynchronous host-to-device copies
Related show

Commit Message

Julian Brown Sept. 17, 2019, 5:21 p.m.
This patch fixes an issue with back-to-back asynchronous compute regions
working on the same data with intervening copyout/copyins. For such
regions, there was a likelihood that asynchronous "copyin" operations
on the second region would take place before the copyout from the first
region had completed.

I'd previously thought that copying asynchronous "host" data for
host-to-device transfers immediately to a temporary buffer was always
safe, but not necessarily optimal. But that turns out to not be true
if the source data in question originates from the user program, and is
used for the output of earlier asynchronous operations.

Unfortunately I don't think there's a way of fixing this problem without
knowing where the source data for a particular host-to-device copy comes
from -- several places (e.g. in gomp_map_vars_internal) have that data
coming from a host stack location, which may be long gone by the time
the asynchronous host-to-device copy takes place.

So, this patch introduces an "ephemeral" parameter to host-to-device
copying functions -- right down to the async entry point for such
copies in the offload plugin. The parameter must be set accurately: if
it is TRUE for copies from user data as above, then stale data may be
copied to the device. If it is FALSE for host-stack originated copies,
or for heap locations that might disappear before the copy takes place,
the host-to-device copy will transfer garbage.

The patch also disables coalescing buffers for asynchronous copies in
target.c, because those too may cause stale data to be copied to the target.

Tested with offloading to AMD GCN. I will apply to the
openacc-gcc-9-branch shortly.

Julian

ChangeLog

	libgomp/
	* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update
	prototype.
	* libgomp.h (gomp_copy_host2dev): Update prototype.
	* oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter.
	* oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
	(update_dev_host): Likewise.
	* oacc-parallel.c (GOACC_enter_exit_data): Call async versions of
	acc_attach/acc_detach/acc_detach_finalize functions.
	* plugin/plugin-gcn.c (wait_for_queue_nonfull): Don't lock/unlock
	aq->mutex here.
	(queue_push_launch): Lock aq->mutex before calling
	wait_for_queue_nonfull.
	(queue_push_callback): Likewise.
	(queue_push_asyncwait): Likewise.
	(queue_push_placeholder): Likewise.
	(GOMP_OFFLOAD_openacc_async_host2dev): Add ephemeral parameter.  Copy
	source data to temporary space immediately if true, and pass to
	queue_push_copy.
	(goacc_device_copy_async): Remove.
	(gomp_copy_host2dev): Add ephemeral parameter. Update function comment.
	Call async host2dev plugin hook directly.
	(gomp_copy_dev2host): Call async dev2host plugin hook directly.
	(gomp_map_vars_existing, gomp_map_pointer, gomp_attach_pointer,
	gomp_detach_pointer): Update calls to gomp_copy_host2dev.
	(gomp_map_vars_internal): Don't use coalescing buffer for asynchronous
	copies. Update calls to gomp_copy_host2dev.
	(gomp_update): Update calls to gomp_copy_host2dev.
	* testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c (main): Fix
	async-safety issue. Increase number of iterations.
	* testsuite/libgomp.oacc-fortran/lib-16-2.f90: Fix async-safety issue.
---
 libgomp/ChangeLog.openacc                     |  33 ++++++
 libgomp/libgomp-plugin.h                      |   3 +-
 libgomp/libgomp.h                             |   2 +-
 libgomp/oacc-host.c                           |   1 +
 libgomp/oacc-mem.c                            |   4 +-
 libgomp/oacc-parallel.c                       |  10 +-
 libgomp/plugin/plugin-gcn.c                   |  43 ++++----
 libgomp/target.c                              | 101 +++++++++---------
 .../libgomp.oacc-c-c++-common/deep-copy-10.c  |  20 ++--
 .../libgomp.oacc-fortran/lib-16-2.f90         |   5 +
 10 files changed, 135 insertions(+), 87 deletions(-)

-- 
2.22.0

Patch

diff --git a/libgomp/ChangeLog.openacc b/libgomp/ChangeLog.openacc
index 5f39fae6f51..1006b8149c8 100644
--- a/libgomp/ChangeLog.openacc
+++ b/libgomp/ChangeLog.openacc
@@ -1,3 +1,36 @@ 
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update
+	prototype.
+	* libgomp.h (gomp_copy_host2dev): Update prototype.
+	* oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter.
+	* oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
+	(update_dev_host): Likewise.
+	* oacc-parallel.c (GOACC_enter_exit_data): Call async versions of
+	acc_attach/acc_detach/acc_detach_finalize functions.
+	* plugin/plugin-gcn.c (wait_for_queue_nonfull): Don't lock/unlock
+	aq->mutex here.
+	(queue_push_launch): Lock aq->mutex before calling
+	wait_for_queue_nonfull.
+	(queue_push_callback): Likewise.
+	(queue_push_asyncwait): Likewise.
+	(queue_push_placeholder): Likewise.
+	(GOMP_OFFLOAD_openacc_async_host2dev): Add ephemeral parameter.  Copy
+	source data to temporary space immediately if true, and pass to
+	queue_push_copy.
+	(goacc_device_copy_async): Remove.
+	(gomp_copy_host2dev): Add ephemeral parameter. Update function comment.
+	Call async host2dev plugin hook directly.
+	(gomp_copy_dev2host): Call async dev2host plugin hook directly.
+	(gomp_map_vars_existing, gomp_map_pointer, gomp_attach_pointer,
+	gomp_detach_pointer): Update calls to gomp_copy_host2dev.
+	(gomp_map_vars_internal): Don't use coalescing buffer for asynchronous
+	copies. Update calls to gomp_copy_host2dev.
+	(gomp_update): Update calls to gomp_copy_host2dev.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c (main): Fix
+	async-safety issue. Increase number of iterations.
+	* testsuite/libgomp.oacc-fortran/lib-16-2.f90: Fix async-safety issue.
+
 2019-09-17  Julian Brown  <julian@codesourcery.com>
 
 	* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index bd63c422b0c..fcd47279332 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -141,7 +141,8 @@  extern void GOMP_OFFLOAD_openacc_async_exec_params (void (*) (void *), size_t,
 						    struct goacc_asyncqueue *);
 extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
 						 struct goacc_asyncqueue *);
-extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *,
+						 size_t, bool,
 						 struct goacc_asyncqueue *);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 803f72db922..ab216a31206 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1120,7 +1120,7 @@  extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
 struct gomp_coalesce_buf;
 extern void gomp_copy_host2dev (struct gomp_device_descr *,
 				struct goacc_asyncqueue *, void *, const void *,
-				size_t, struct gomp_coalesce_buf *);
+				size_t, bool, struct gomp_coalesce_buf *);
 extern void gomp_copy_dev2host (struct gomp_device_descr *,
 				struct goacc_asyncqueue *, void *, const void *,
 				size_t);
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 0231b597114..4bc2eeb3c53 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -230,6 +230,7 @@  host_openacc_async_host2dev (int ord __attribute__ ((unused)),
 			     void *dst __attribute__ ((unused)),
 			     const void *src __attribute__ ((unused)),
 			     size_t n __attribute__ ((unused)),
+			     bool eph __attribute__ ((unused)),
 			     struct goacc_asyncqueue *aq
 			     __attribute__ ((unused)))
 {
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index c07a5eb42a7..f8c71bf04c5 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -203,7 +203,7 @@  memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
   if (from)
     gomp_copy_dev2host (thr->dev, aq, h, d, s);
   else
-    gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+    gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
 
   if (profiling_p)
     {
@@ -819,7 +819,7 @@  update_dev_host (int is_dev, void *h, size_t s, int async)
   goacc_aq aq = get_goacc_asyncqueue (async);
 
   if (is_dev)
-    gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+    gomp_copy_host2dev (acc_dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
   else
     gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 0c9cb3c461c..a3ec0ed2adf 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -1022,7 +1022,7 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  if (!pointer)
 	    {
 	      if (kind == GOMP_MAP_ATTACH)
-		acc_attach (hostaddrs[i]);
+		acc_attach_async (hostaddrs[i], async);
 	      else if (kind == GOMP_MAP_STRUCT)
 	        i += sizes[i];
 	    }
@@ -1042,9 +1042,9 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  if (!pointer)
 	    {
 	      if (kind == GOMP_MAP_DETACH)
-		acc_detach (hostaddrs[i]);
+		acc_detach_async (hostaddrs[i], async);
 	      else if (kind == GOMP_MAP_FORCE_DETACH)
-		acc_detach_finalize (hostaddrs[i]);
+		acc_detach_finalize_async (hostaddrs[i], async);
 	      else if (kind == GOMP_MAP_STRUCT)
 	        i += sizes[i];
 	    }
@@ -1053,9 +1053,9 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	      unsigned char kind2 = kinds[i + pointer - 1] & 0xff;
 
 	      if (kind2 == GOMP_MAP_DETACH)
-		acc_detach (hostaddrs[i + pointer - 1]);
+		acc_detach_async (hostaddrs[i + pointer - 1], async);
 	      else if (kind2 == GOMP_MAP_FORCE_DETACH)
-	        acc_detach_finalize (hostaddrs[i + pointer - 1]);
+	        acc_detach_finalize_async (hostaddrs[i + pointer - 1], async);
 
 	      i += pointer - 1;
 	    }
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index b8ec96391f7..b5995af0a06 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1408,13 +1408,9 @@  wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
 {
   if (aq->queue_n == ASYNC_QUEUE_SIZE)
     {
-      pthread_mutex_lock (&aq->mutex);
-
       /* Queue is full.  Wait for it to not be full.  */
       while (aq->queue_n == ASYNC_QUEUE_SIZE)
 	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
-
-      pthread_mutex_unlock (&aq->mutex);
     }
 }
 
@@ -1424,10 +1420,10 @@  queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
 {
   assert (aq->agent == kernel->agent);
 
-  wait_for_queue_nonfull (aq);
-
   pthread_mutex_lock (&aq->mutex);
 
+  wait_for_queue_nonfull (aq);
+
   int queue_last = ((aq->queue_first + aq->queue_n)
 		    % ASYNC_QUEUE_SIZE);
   if (DEBUG_QUEUES)
@@ -1453,10 +1449,10 @@  static void
 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
 		     void *data)
 {
-  wait_for_queue_nonfull (aq);
-
   pthread_mutex_lock (&aq->mutex);
 
+  wait_for_queue_nonfull (aq);
+
   int queue_last = ((aq->queue_first + aq->queue_n)
 		    % ASYNC_QUEUE_SIZE);
   if (DEBUG_QUEUES)
@@ -1484,10 +1480,10 @@  static void
 queue_push_asyncwait (struct goacc_asyncqueue *aq,
 		      struct placeholder *placeholderp)
 {
-  wait_for_queue_nonfull (aq);
-
   pthread_mutex_lock (&aq->mutex);
 
+  wait_for_queue_nonfull (aq);
+
   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
   if (DEBUG_QUEUES)
     HSA_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
@@ -1511,10 +1507,10 @@  queue_push_placeholder (struct goacc_asyncqueue *aq)
 {
   struct placeholder *placeholderp;
 
-  wait_for_queue_nonfull (aq);
-
   pthread_mutex_lock (&aq->mutex);
 
+  wait_for_queue_nonfull (aq);
+
   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
   if (DEBUG_QUEUES)
     HSA_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
@@ -3683,19 +3679,22 @@  GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
-				     size_t n, struct goacc_asyncqueue *aq)
+				     size_t n, bool ephemeral,
+				     struct goacc_asyncqueue *aq)
 {
   struct agent_info *agent = get_agent_info (device);
   assert (agent == aq->agent);
-  /* The source data does not necessarily remain live until the deferred
-     copy happens.  Taking a snapshot of the data here avoids reading
-     uninitialised data later, but means that (a) data is copied twice and
-     (b) modifications to the copied data between the "spawning" point of
-     the asynchronous kernel and when it is executed will not be seen.
-     But, that is probably correct.  */
-  void *src_copy = GOMP_PLUGIN_malloc (n);
-  memcpy (src_copy, src, n);
-  queue_push_copy (aq, dst, src_copy, n, true);
+
+  if (ephemeral)
+    {
+      /* The source data is on the stack or otherwise may be deallocated
+	 before the asynchronous copy takes place.  Take a copy of the source
+	 data.  */
+      void *src_copy = GOMP_PLUGIN_malloc (n);
+      memcpy (src_copy, src, n);
+      src = src_copy;
+    }
+  queue_push_copy (aq, dst, src, n, ephemeral);
   return true;
 }
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 0656df19613..683a42b1164 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -194,22 +194,6 @@  gomp_device_copy (struct gomp_device_descr *devicep,
     }
 }
 
-static inline void
-goacc_device_copy_async (struct gomp_device_descr *devicep,
-			 bool (*copy_func) (int, void *, const void *, size_t,
-					    struct goacc_asyncqueue *),
-			 const char *dst, void *dstaddr,
-			 const char *src, const void *srcaddr,
-			 size_t size, struct goacc_asyncqueue *aq)
-{
-  if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
-    {
-      gomp_mutex_unlock (&devicep->lock);
-      gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
-		  src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
-    }
-}
-
 /* Infrastructure for coalescing adjacent or nearly adjacent (in device
    addresses) host to device memory transfers.  */
 
@@ -303,15 +287,17 @@  gomp_to_device_kind_p (int kind)
 }
 
 /* Copy host memory to an offload device.  In asynchronous mode (if AQ is
-   non-NULL), H may point to a stack location.  It is up to the underlying
-   plugin to ensure that this data is read immediately, rather than at some
-   later point when the stack frame will likely have been destroyed.  */
+   non-NULL), when the source data is stack or may otherwise be deallocated
+   before the asynchronous copy takes place, EPHEMERAL must be passed as
+   TRUE.  The CBUF isn't used for non-ephemeral asynchronous copies, because
+   the host data might not be computed yet (by an earlier asynchronous compute
+   region).  */
 
 attribute_hidden void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
 		    struct goacc_asyncqueue *aq,
 		    void *d, const void *h, size_t sz,
-		    struct gomp_coalesce_buf *cbuf)
+		    bool ephemeral, struct gomp_coalesce_buf *cbuf)
 {
   if (cbuf)
     {
@@ -339,8 +325,15 @@  gomp_copy_host2dev (struct gomp_device_descr *devicep,
 	}
     }
   if (__builtin_expect (aq != NULL, 0))
-    goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
-			     "dev", d, "host", h, sz, aq);
+    {
+      if (!devicep->openacc.async.host2dev_func (devicep->target_id, d, h, sz,
+						 ephemeral, aq))
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("Copying of host object [%p..%p) to dev object [%p..%p) "
+		      "failed", h, h + sz, d, d + sz);
+	}
+    }
   else
     gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
@@ -351,8 +344,15 @@  gomp_copy_dev2host (struct gomp_device_descr *devicep,
 		    void *h, const void *d, size_t sz)
 {
   if (__builtin_expect (aq != NULL, 0))
-    goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
-			     "host", h, "dev", d, sz, aq);
+    {
+      if (!devicep->openacc.async.dev2host_func (devicep->target_id, h, d, sz,
+						 aq))
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("Copying of dev object [%p..%p) to host object [%p..%p) "
+		      "failed", d, d + sz, h, h + sz);
+	}
+    }
   else
     gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
 }
@@ -579,7 +579,7 @@  gomp_map_vars_existing (struct gomp_device_descr *devicep,
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
-			newn->host_end - newn->host_start, cbuf);
+			newn->host_end - newn->host_start, false, cbuf);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -607,8 +607,8 @@  gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
       cur_node.tgt_offset = (uintptr_t) NULL;
       gomp_copy_host2dev (devicep, aq,
 			  (void *) (tgt->tgt_start + target_offset),
-			  (void *) &cur_node.tgt_offset, sizeof (void *),
-			  cbuf);
+			  (void *) &cur_node.tgt_offset,
+			  sizeof (void *), true, cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -628,7 +628,8 @@  gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
   gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
+		      (void *) &cur_node.tgt_offset, sizeof (void *), true,
+		      cbuf);
 }
 
 static void
@@ -760,7 +761,7 @@  gomp_attach_pointer (struct gomp_device_descr *devicep,
 		  (void *) (n->tgt->tgt_start + n->tgt_offset), (void *) data);
 
       gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &data,
-			  sizeof (void *), cbufp);
+			  sizeof (void *), true, cbufp);
     }
   else
     gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
@@ -815,7 +816,7 @@  gomp_detach_pointer (struct gomp_device_descr *devicep,
 		  (void *) target);
 
       gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &target,
-			  sizeof (void *), cbufp);
+			  sizeof (void *), true, cbufp);
     }
   else
     gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
@@ -1147,8 +1148,9 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	      for (i = first; i <= last; i++)
 		{
 		  tgt->list[i].key = NULL;
-		  if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
-					     & typemask))
+		  if (!aq
+		      && gomp_to_device_kind_p (get_kind (short_mapkind, kinds,
+							  i) & typemask))
 		    gomp_coalesce_buf_add (&cbuf,
 					   tgt_size - cur_node.host_end
 					   + (uintptr_t) hostaddrs[i],
@@ -1209,8 +1211,9 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	  gomp_coalesce_buf_add (&cbuf, tgt_size,
-				 cur_node.host_end - cur_node.host_start);
+	  if (!aq)
+	    gomp_coalesce_buf_add (&cbuf, tgt_size,
+				   cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  has_firstprivate = true;
 	  continue;
@@ -1240,7 +1243,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	  if (gomp_to_device_kind_p (kind & typemask))
+	  if (!aq && gomp_to_device_kind_p (kind & typemask))
 	    gomp_coalesce_buf_add (&cbuf, tgt_size,
 				   cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
@@ -1395,7 +1398,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		len = sizes[i];
 		gomp_copy_host2dev (devicep, aq,
 				    (void *) (tgt->tgt_start + tgt_size),
-				    (void *) hostaddrs[i], len, cbufp);
+				    (void *) hostaddrs[i], len, false, cbufp);
 		tgt_size += len;
 		continue;
 	      case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -1448,12 +1451,11 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		if (cur_node.tgt_offset)
 		  cur_node.tgt_offset -= sizes[i];
 		gomp_copy_host2dev (devicep, aq,
-				    (void *) (n->tgt->tgt_start
-					      + n->tgt_offset
+				    (void *) (n->tgt->tgt_start + n->tgt_offset
 					      + cur_node.host_start
 					      - n->host_start),
 				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *), cbufp);
+				    sizeof (void *), true, cbufp);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -1612,7 +1614,8 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start, cbufp);
+					k->host_end - k->host_start, false,
+					cbufp);
 		    break;
 		  case GOMP_MAP_POINTER:
 		    gomp_map_pointer (tgt, aq,
@@ -1624,7 +1627,8 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start, cbufp);
+					k->host_end - k->host_start, false,
+					cbufp);
 
 		    for (j = i + 1; j < mapnum; j++)
 		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -1676,7 +1680,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					sizeof (void *), cbufp);
+					sizeof (void *), false, cbufp);
 		    break;
 		  default:
 		    gomp_mutex_unlock (&devicep->lock);
@@ -1692,7 +1696,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		    /* We intentionally do not use coalescing here, as it's not
 		       data allocated by the current call to this function.  */
 		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
-					&tgt_addr, sizeof (void *), NULL);
+					&tgt_addr, sizeof (void *), true, NULL);
 		  }
 		array++;
 	      }
@@ -1779,7 +1783,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 					    (void *) tgt->tgt_start
 						     + k->tgt_offset,
 					    (void *) k->host_start,
-					    da->data_row_size, cbufp);
+					    da->data_row_size, false, cbufp);
 		      array++;
 		    }
 		  target_data_rows[row_start + j] = (void *) target_row_addr;
@@ -1793,7 +1797,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		  void *ptrblock = gomp_dynamic_array_create_ptrblock
 		    (da, target_ptrblock, target_data_rows + row_start);
 		  gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
-				      da->ptrblock_size, cbufp);
+				      da->ptrblock_size, true, cbufp);
 		  free (ptrblock);
 		}
 
@@ -1817,7 +1821,7 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	  gomp_copy_host2dev (devicep, aq,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
 			      (void *) &cur_node.tgt_offset, sizeof (void *),
-			      cbufp);
+			      true, cbufp);
 	}
     }
 
@@ -1829,7 +1833,8 @@  gomp_map_vars_internal (struct gomp_device_descr *devicep,
 			    (void *) (tgt->tgt_start + cbuf.chunks[c].start),
 			    (char *) cbuf.buf + (cbuf.chunks[c].start
 						 - cbuf.chunks[0].start),
-			    cbuf.chunks[c].end - cbuf.chunks[c].start, NULL);
+			    cbuf.chunks[c].end - cbuf.chunks[c].start, true,
+			    NULL);
       free (cbuf.buf);
       cbuf.buf = NULL;
       cbufp = NULL;
@@ -2099,7 +2104,7 @@  gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
 	      gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
-				  NULL);
+				  false, NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
 	      gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
 	  }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
index 37cde4ef059..0bc43e9477f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
@@ -1,6 +1,10 @@ 
 #include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
 
-/* Test asyncronous attach and detach operation.  */
+#define ITERATIONS 1023
+
+/* Test asynchronous attach and detach operation.  */
 
 typedef struct {
   int *a;
@@ -25,13 +29,13 @@  main (int argc, char* argv[])
 
 #pragma acc enter data copyin(m)
 
-  for (int i = 0; i < 99; i++)
+  for (int i = 0; i < ITERATIONS; i++)
     {
       int j;
-#pragma acc parallel loop copy(m.a[0:N]) async(i % 2)
+#pragma acc parallel loop copy(m.a[0:N]) async(0)
       for (j = 0; j < N; j++)
         m.a[j]++;
-#pragma acc parallel loop copy(m.b[0:N]) async((i + 1) % 2)
+#pragma acc parallel loop copy(m.b[0:N]) async(1)
       for (j = 0; j < N; j++)
         m.b[j]++;
     }
@@ -40,10 +44,10 @@  main (int argc, char* argv[])
 
   for (i = 0; i < N; i++)
     {
-      if (m.a[i] != 99)
-        abort ();
-      if (m.b[i] != 99)
-        abort ();
+      if (m.a[i] != ITERATIONS)
+	abort ();
+      if (m.b[i] != ITERATIONS)
+	abort ();
     }
 
   free (m.a);
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
index fa76f65912f..94b80d07f4f 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
@@ -27,6 +27,9 @@  program main
 
   if (acc_is_present (h) .neqv. .TRUE.) call abort
 
+  ! We must wait for the update to be done.
+  call acc_wait (async)
+
   h(:) = 0
 
   call acc_copyout_async (h, sizeof (h), async)
@@ -45,6 +48,8 @@  program main
   
   if (acc_is_present (h) .neqv. .TRUE.) call abort
 
+  call acc_wait (async)
+
   do i = 1, N
     if (h(i) /= i + i) call abort
   end do