[og9] OpenACC profiling-interface fixes for asynchronous operations

Message ID 20190917172156.111727-4-julian@codesourcery.com
State New
Headers show
Series
  • [og9] OpenACC profiling-interface fixes for asynchronous operations
Related show

Commit Message

Julian Brown Sept. 17, 2019, 5:21 p.m.
This patch fixes some problems with the OpenACC profiling interface when
used with asynchronous offload operations. Essentially, the profiling
operations themselves must be launched asynchronously, otherwise they
will measure the wrong thing, and/or execute at the same time as the
operation they are supposed to be measuring.

A consequence of this change is that "enqueueing" profiling callbacks
are no longer predictably ordered with respect to the callbacks
relating to the execution of the related asynchronous operations. The
acc_prof-parallel-1.c test is adjusted accordingly.

Tested with offloading to AMD GCN. I will apply to the
openacc-gcc-9-branch shortly.

Julian

ChangeLog

	libgomp/
	* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
	function immediately.
	* oacc-parallel.c (struct async_prof_callback_info, async_prof_dispatch,
	queue_async_prof_dispatch): New.
	(GOACC_parallel_keyed): Call queue_async_prof_dispatch for asynchronous
	profile-event dispatches.
	(GOACC_enter_exit_data): Likewise.
	(GOACC_update): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
	(cb_compute_construct_start): Remove/fix TODO.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
	(cb_exit_data_start): Tweak expected state values.
	(cb_exit_data_end): Likewise.
	(cb_compute_construct_start): Remove/fix TODO.
	(cb_compute_construct_end): Don't do adjustments for
	acc_ev_enqueue_launch_start/acc_ev_enqueue_launch_end callbacks.
	(cb_compute_construct_end): Tweak expected state values.
	(cb_enqueue_launch_start, cb_enqueue_launch_end): Don't expect
	launch-enqueue operations to happen synchronously with respect to
	profiling events on async streams.
	(main): Tweak expected state values.
	* testsuite/libgomp.oacc-c-c++-common/lib-94.c (main): Reorder
	operations for async-safety.
---
 libgomp/ChangeLog.openacc                     |  26 +++
 libgomp/oacc-host.c                           |   5 +-
 libgomp/oacc-parallel.c                       | 181 +++++++++++++++---
 .../acc_prof-init-1.c                         |   5 +-
 .../acc_prof-parallel-1.c                     |  64 ++-----
 .../libgomp.oacc-c-c++-common/lib-94.c        |   4 +-
 6 files changed, 197 insertions(+), 88 deletions(-)

-- 
2.22.0

Patch

diff --git a/libgomp/ChangeLog.openacc b/libgomp/ChangeLog.openacc
index 41e05e9c61b..5f39fae6f51 100644
--- a/libgomp/ChangeLog.openacc
+++ b/libgomp/ChangeLog.openacc
@@ -1,3 +1,29 @@ 
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
+	function immediately.
+	* oacc-parallel.c (struct async_prof_callback_info, async_prof_dispatch,
+	queue_async_prof_dispatch): New.
+	(GOACC_parallel_keyed): Call queue_async_prof_dispatch for asynchronous
+	profile-event dispatches.
+	(GOACC_enter_exit_data): Likewise.
+	(GOACC_update): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+	(cb_compute_construct_start): Remove/fix TODO.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+	(cb_exit_data_start): Tweak expected state values.
+	(cb_exit_data_end): Likewise.
+	(cb_compute_construct_start): Remove/fix TODO.
+	(cb_compute_construct_end): Don't do adjustments for
+	acc_ev_enqueue_launch_start/acc_ev_enqueue_launch_end callbacks.
+	(cb_compute_construct_end): Tweak expected state values.
+	(cb_enqueue_launch_start, cb_enqueue_launch_end): Don't expect
+	launch-enqueue operations to happen synchronously with respect to
+	profiling events on async streams.
+	(main): Tweak expected state values.
+	* testsuite/libgomp.oacc-c-c++-common/lib-94.c (main): Reorder
+	operations for async-safety.
+
 2019-09-17  Julian Brown  <julian@codesourcery.com>
 
 	* target.c (gomp_map_vars_internal): Remove read of uninitialised
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 21f73302f03..0231b597114 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -250,10 +250,9 @@  host_openacc_async_dev2host (int ord __attribute__ ((unused)),
 static void
 host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
 				   __attribute__ ((unused)),
-				   void (*callback_fn)(void *)
-				   __attribute__ ((unused)),
-				   void *userptr __attribute__ ((unused)))
+				   void (*callback_fn)(void *), void *userptr)
 {
+  callback_fn (userptr);
 }
 
 static struct goacc_asyncqueue *
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 1bd0775f226..0c9cb3c461c 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -169,6 +169,62 @@  goacc_call_host_fn (void (*fn) (void *), size_t mapnum, void **hostaddrs,
   fn (hostaddrs);
 }
 
+struct async_prof_callback_info {
+  acc_prof_info prof_info;
+  acc_event_info event_info;
+  acc_api_info api_info;
+  struct async_prof_callback_info *start_info;
+};
+
+static void
+async_prof_dispatch (void *ptr)
+{
+  struct async_prof_callback_info *info
+    = (struct async_prof_callback_info *) ptr;
+
+  if (info->start_info)
+    {
+      /* The TOOL_INFO must be preserved from a start event to the
+	 corresponding end event.  Copy that here.  */
+      void *tool_info = info->start_info->event_info.other_event.tool_info;
+      info->event_info.other_event.tool_info = tool_info;
+    }
+
+  goacc_profiling_dispatch (&info->prof_info, &info->event_info,
+			    &info->api_info);
+
+  /* The async_prof_dispatch function is (so far) always used for start/end
+     profiling event pairs: the start and end parts are queued, then each is
+     dispatched (or the dispatches might be interleaved before the end part is
+     queued).
+     In any case, it's not safe to delete either info structure before the
+     whole bracketed event is complete.  */
+
+  if (info->start_info)
+    {
+      free (info->start_info);
+      free (info);
+    }
+}
+
+static struct async_prof_callback_info *
+queue_async_prof_dispatch (struct gomp_device_descr *devicep, goacc_aq aq,
+			   acc_prof_info *prof_info, acc_event_info *event_info,
+			   acc_api_info *api_info,
+			   struct async_prof_callback_info *prev_info)
+{
+  struct async_prof_callback_info *info = malloc (sizeof (*info));
+
+  info->prof_info = *prof_info;
+  info->event_info = *event_info;
+  info->api_info = *api_info;
+  info->start_info = prev_info;
+
+  devicep->openacc.async.queue_callback_func (aq, async_prof_dispatch,
+					      (void *) info);
+  return info;
+}
+
 /* Launch a possibly offloaded function with FLAGS.  FN is the host fn
    address.  MAPNUM, HOSTADDRS, SIZES & KINDS  describe the memory
    blocks to be copied to/from the device.  Varadic arguments are
@@ -194,6 +250,8 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
   unsigned dims[GOMP_DIM_MAX];
   unsigned tag;
   bool args_exploded = false;
+  struct async_prof_callback_info *comp_start_info = NULL,
+				  *data_start_info = NULL;
 
 #ifdef HAVE_INTTYPES_H
   gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
@@ -255,10 +313,6 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
       api_info.async_handle = NULL;
     }
 
-  if (profiling_p)
-    goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
-			      &api_info);
-
   handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
   /* Default: let the runtime choose.  */
@@ -294,11 +348,12 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
 	    if (async == GOMP_LAUNCH_OP_MAX)
 	      async = va_arg (ap, unsigned);
 
-	    if (profiling_p)
-	      {
-		prof_info.async = async;
-		prof_info.async_queue = prof_info.async;
-	      }
+	    /* Set async number in profiling data, unless the device is the
+	       host or we're doing host fallback.  */
+	    if (profiling_p
+	        && !(flags & GOACC_FLAG_HOST_FALLBACK)
+		&& acc_device_type (acc_dev->type) != acc_device_host)
+	      prof_info.async = prof_info.async_queue = async;
 
 	    break;
 	  }
@@ -321,6 +376,20 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
     }
   va_end (ap);
 
+  goacc_aq aq = get_goacc_asyncqueue (async);
+
+  if (profiling_p)
+    {
+      if (aq)
+	comp_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &compute_construct_event_info,
+				       &api_info, NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				  &api_info);
+    }
+
   /* Host fallback if "if" clause is false or if the current device is set to
      the host.  */
   if (flags & GOACC_FLAG_HOST_FALLBACK)
@@ -368,12 +437,16 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
 	= compute_construct_event_info.other_event.parent_construct;
       enter_exit_data_event_info.other_event.implicit = 1;
       enter_exit_data_event_info.other_event.tool_info = NULL;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
-  goacc_aq aq = get_goacc_asyncqueue (async);
-
   tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
 			     true, GOMP_MAP_VARS_OPENACC);
 
@@ -391,8 +464,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
       prof_info.event_type = acc_ev_enter_data_end;
       enter_exit_data_event_info.other_event.event_type
 	= prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
@@ -423,8 +501,14 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
       prof_info.event_type = acc_ev_exit_data_start;
       enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
       enter_exit_data_event_info.other_event.tool_info = NULL;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
   /* If running synchronously, unmap immediately.  */
@@ -437,8 +521,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
     {
       prof_info.event_type = acc_ev_exit_data_end;
       enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
 #ifdef RC_CHECKING
@@ -453,8 +542,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
       prof_info.event_type = acc_ev_compute_construct_end;
       compute_construct_event_info.other_event.event_type
 	= prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &compute_construct_event_info, &api_info,
+				   comp_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				  &api_info);
 
       thr->prof_info = NULL;
       thr->api_info = NULL;
@@ -697,6 +791,7 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
   struct gomp_device_descr *acc_dev;
   bool data_enter = false;
   size_t i;
+  struct async_prof_callback_info *data_start_info = NULL;
 
   goacc_lazy_initialize ();
 
@@ -806,9 +901,19 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
       api_info.async_handle = NULL;
     }
 
+  goacc_aq aq = get_goacc_asyncqueue (async);
+
   if (profiling_p)
-    goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-			      &api_info);
+    {
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
 
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
@@ -867,7 +972,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		case GOMP_MAP_STRUCT:
 		  {
 		    int elems = sizes[i];
-		    goacc_aq aq = get_goacc_asyncqueue (async);
 		    gomp_map_vars_async (acc_dev, aq, elems + 1, &hostaddrs[i],
 					 NULL, &sizes[i], &kinds[i], true,
 					 GOMP_MAP_VARS_OPENACC_ENTER_DATA);
@@ -890,7 +994,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 					   &sizes[i], &kinds[i]);
 	      else
 	        {
-		  goacc_aq aq = get_goacc_asyncqueue (async);
 		  for (int j = 0; j < 2; j++)
 		    gomp_map_vars_async (acc_dev, aq,
 					 (j == 0 || pointer == 2) ? 1 : 2,
@@ -1003,7 +1106,6 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		case GOMP_MAP_STRUCT:
 		  {
 		    int elems = sizes[i];
-		    goacc_aq aq = get_goacc_asyncqueue (async);
 		    for (int j = 1; j <= elems; j++)
 		      {
 			struct splay_tree_key_s k;
@@ -1067,8 +1169,13 @@  GOACC_enter_exit_data (int flags_m, size_t mapnum,
       prof_info.event_type
 	= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
       enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
 
       thr->prof_info = NULL;
       thr->api_info = NULL;
@@ -1120,6 +1227,8 @@  GOACC_update (int flags_m, size_t mapnum,
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
+  goacc_aq aq = NULL;
+  struct async_prof_callback_info *update_start_info = NULL;
 
   bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
 
@@ -1169,7 +1278,15 @@  GOACC_update (int flags_m, size_t mapnum,
     }
 
   if (profiling_p)
-    goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+    {
+      aq = get_goacc_asyncqueue (async);
+      if (aq)
+	update_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &update_event_info, &api_info, NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+    }
 
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
@@ -1257,7 +1374,11 @@  GOACC_update (int flags_m, size_t mapnum,
     {
       prof_info.event_type = acc_ev_update_end;
       update_event_info.other_event.event_type = prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info, &update_event_info,
+				   &api_info, update_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
 
       thr->prof_info = NULL;
       thr->api_info = NULL;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
index cf980f1baec..1af53cb72b9 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -159,7 +159,10 @@  static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info
   assert (prof_info->device_type == acc_device_type);
   assert (prof_info->device_number == acc_device_num);
   assert (prof_info->thread_id == -1);
-  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
   assert (prof_info->async_queue == prof_info->async);
   assert (prof_info->src_file == NULL);
   assert (prof_info->func_name == NULL);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
index 5d392511592..0cb0369168b 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -284,9 +284,9 @@  static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_
 {
   DEBUG_printf ("%s\n", __FUNCTION__);
 
-  assert (state == 7
+  assert (state == 5
 #if ASYNC_EXIT_DATA
-	  || state == 107
+	  || state == 105
 #endif
 	  );
   STATE_OP (state, ++);
@@ -340,9 +340,9 @@  static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_in
 {
   DEBUG_printf ("%s\n", __FUNCTION__);
 
-  assert (state == 8
+  assert (state == 6
 #if ASYNC_EXIT_DATA
-	  || state == 108
+	  || state == 106
 #endif
 	  );
   STATE_OP (state, ++);
@@ -426,7 +426,10 @@  static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info
   assert (prof_info->device_type == acc_device_type);
   assert (prof_info->device_number == acc_device_num);
   assert (prof_info->thread_id == -1);
-  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
   assert (prof_info->async_queue == prof_info->async);
   assert (prof_info->src_file == NULL);
   assert (prof_info->func_name == NULL);
@@ -467,9 +470,6 @@  static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
     {
       /* Compensate for the missing 'acc_ev_enter_data_end'.  */
       state += 1;
-      /* Compensate for the missing 'acc_ev_enqueue_launch_start' and
-	 'acc_ev_enqueue_launch_end'.  */
-      state += 2;
       /* Compensate for the missing 'acc_ev_exit_data_start' and
 	 'acc_ev_exit_data_end'.  */
       state += 2;
@@ -482,8 +482,8 @@  static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
       state += 2;
     }
 #endif
-  assert (state == 9
-	  || state == 109);
+  assert (state == 7
+	  || state == 107);
   STATE_OP (state, ++);
 
   assert (tool_info != NULL);
@@ -537,17 +537,6 @@  static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
 
   assert (acc_device_type != acc_device_host);
 
-  assert (state == 5
-	  || state == 105);
-  STATE_OP (state, ++);
-
-  assert (tool_info != NULL);
-  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
-  assert (tool_info->nested == NULL);
-  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
-  assert (tool_info->nested != NULL);
-  tool_info->nested->nested = NULL;
-
   assert (prof_info->event_type == acc_ev_enqueue_launch_start);
   assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
   assert (prof_info->version == _ACC_PROF_INFO_VERSION);
@@ -591,13 +580,6 @@  static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
   assert (api_info->device_handle == NULL);
   assert (api_info->context_handle == NULL);
   assert (api_info->async_handle == NULL);
-
-  tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
-  tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
-  tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
-  tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
-  tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
-  event_info->other_event.tool_info = tool_info->nested;
 }
 
 static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
@@ -606,19 +588,6 @@  static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
 
   assert (acc_device_type != acc_device_host);
 
-  assert (state == 6
-	  || state == 106);
-  STATE_OP (state, ++);
-
-  assert (tool_info != NULL);
-  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
-  assert (tool_info->nested != NULL);
-  assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
-  assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
-  assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
-  assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
-  assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
-
   assert (prof_info->event_type == acc_ev_enqueue_launch_end);
   assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
   assert (prof_info->version == _ACC_PROF_INFO_VERSION);
@@ -638,12 +607,7 @@  static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
   assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
   assert (event_info->launch_event.parent_construct == acc_construct_parallel);
   assert (event_info->launch_event.implicit == 1);
-  assert (event_info->launch_event.tool_info == tool_info->nested);
   assert (event_info->launch_event.kernel_name != NULL);
-  assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
-  assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
-  assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
-  assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
 
   if (acc_device_type == acc_device_host)
     assert (api_info->device_api == acc_device_api_none);
@@ -657,10 +621,6 @@  static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
   assert (api_info->device_handle == NULL);
   assert (api_info->context_handle == NULL);
   assert (api_info->async_handle == NULL);
-
-  free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
-  free (tool_info->nested);
-  tool_info->nested = NULL;
 }
 
 
@@ -707,7 +667,7 @@  int main()
     }
     assert (state_init == 4);
   }
-  assert (state == 10);
+  assert (state == 8);
 
   STATE_OP (state, = 100);
 
@@ -723,7 +683,7 @@  int main()
 #pragma acc wait
     assert (state_init == 104);
   }
-  assert (state == 110);
+  assert (state == 108);
 
   return 0;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c
index 54497237b0c..baa3ac83f04 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c
@@ -22,10 +22,10 @@  main (int argc, char **argv)
 
   acc_copyin_async (h, N, async);
 
-  memset (h, 0, N);
-
   acc_wait (async);
 
+  memset (h, 0, N);
+
   acc_copyout_async (h, N, async + 1);
 
   acc_wait (async + 1);