[OpenACC] Adjustments and additions to testcases

Message ID 01750e29-e88c-e456-30f4-e8385e99e1b7@mentor.com
State New
Headers show
Series
  • [OpenACC] Adjustments and additions to testcases
Related show

Commit Message

Chung-Lin Tang Oct. 22, 2018, 4:07 p.m.
Hi Thomas,
this patch is a collection of testcase patches we had, ready to be committed to trunk.
I believe this only touches those parts where you can review, is this okay to apply?

Thanks,
Chung-Lin

2018-10-22  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/testsuite/
	* g++.dg/goacc/loop-1.c: New test.
	* g++.dg/goacc/loop-2.c: New test.
	* g++.dg/goacc/loop-3.c: New test.


2018-10-22  James Norris  <jnorris@codesourcery.com>
	    Cesar Philippidis  <cesar@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update parallel
	regions to denote variables copyied in via acc enter data as
	present.
	* testsuite/libgomp.oacc-fortran/data-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/data-4.f90: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/subr.h: Reimplement.
	* testsuite/libgomp.oacc-c-c++-common/subr.ptx: Regenerated PTX.
	* testsuite/libgomp.oacc-c-c++-common/timer.h: Removed.
	* testsuite/libgomp.oacc-c-c++-common/lib-69.c: Change async checks.
	* testsuite/libgomp.oacc-c-c++-common/lib-70.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-71.c: Rework kernel i/f.
	* testsuite/libgomp.oacc-c-c++-common/lib-72.c: Rework kernel i/f and
	change async checks.
	* testsuite/libgomp.oacc-c-c++-common/lib-73.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-74.c: Rework kernel i/f and
	timing checks.
	* testsuite/libgomp.oacc-c-c++-common/lib-75.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-76.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-78.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-80.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-82.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-93.c: New test.

Patch

Index: gcc/testsuite/g++.dg/goacc/loop-1.c
===================================================================
--- gcc/testsuite/g++.dg/goacc/loop-1.c	(nonexistent)
+++ gcc/testsuite/g++.dg/goacc/loop-1.c	(working copy)
@@ -0,0 +1,23 @@ 
+void
+f (int i, float j, int k)
+{
+#pragma acc parallel num_gangs (i) num_workers (i) vector_length (i)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (j) /* { dg-error "'num_gangs' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_workers (j) /* { dg-error "'num_workers' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel vector_length (j) /* { dg-error "'vector_length' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+}
Index: gcc/testsuite/g++.dg/goacc/loop-2.c
===================================================================
--- gcc/testsuite/g++.dg/goacc/loop-2.c	(nonexistent)
+++ gcc/testsuite/g++.dg/goacc/loop-2.c	(working copy)
@@ -0,0 +1,70 @@ 
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (num: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, *) /* { dg-error "duplicate operand to clause" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, static: *) /* { dg-error "duplicate 'num' argument" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (static: 234) /* { dg-error "expected 'num' before" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234, num: 12) /* { dg-error "duplicate operand to clause" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop vector /* { dg-error "gang, worker and vector must occur in this order in a loop nest" } */
+  for (i = 0; i < 20; ++i)
+#pragma acc loop worker
+    for (j = 0; j < 25; ++j)
+      ;
+
+#pragma acc kernels
+#pragma acc loop worker (length: 20) /* { dg-error "expected 'num' before 'length'" } */
+  for (i = 0; i < 20; ++i)
+#pragma acc loop vector (length: 10)
+    for (j = 0; j < 25; ++j)
+      ;
+
+#pragma acc kernels
+#pragma acc loop worker
+  for (i = 0; i < 20; ++i)
+#pragma acc loop vector
+    for (j = 0; j < 25; ++j)
+      ;
+}
Index: gcc/testsuite/g++.dg/goacc/loop-3.c
===================================================================
--- gcc/testsuite/g++.dg/goacc/loop-3.c	(nonexistent)
+++ gcc/testsuite/g++.dg/goacc/loop-3.c	(working copy)
@@ -0,0 +1,43 @@ 
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels num_gangs (10) /* { dg-error "'num_gangs' is not valid" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels num_workers (10) /* { dg-error "'num_workers' is not valid" } */
+#pragma acc loop worker
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels vector_length (10) /* { dg-error "'vector_length' is not valid" } */
+#pragma acc loop vector
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (10) num_workers (20) vector_length (32)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (i) num_workers (j) vector_length (k)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_workers (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel vector_length (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c	(working copy)
@@ -54,7 +54,7 @@  main (int argc, char **argv)
 #pragma acc enter data copyin (a[0:N]) async 
 #pragma acc enter data copyin (b[0:N]) async wait
 #pragma acc enter data copyin (N) async wait
-#pragma acc parallel async wait
+#pragma acc parallel async wait present (a[0:N]) present (b[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = a[i];
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c	(working copy)
@@ -9,48 +9,16 @@ 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "./subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
-  if (r != CUDA_SUCCESS)
-    {
       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
@@ -62,20 +30,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -90,7 +44,7 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -97,25 +51,15 @@  main (int argc, char **argv)
       abort ();
     }
 
-  if (acc_async_test (0) != 0)
-    {
-      fprintf (stderr, "asynchronous operation not running\n");
-      abort ();
-    }
+  if (acc_async_test (0) == 1)
+    fprintf (stderr, "expected asynchronous operation to be running\n");
 
-  sleep (1);
+  acc_wait_all ();
 
-  if (acc_async_test (0) != 1)
-    {
-      fprintf (stderr, "found asynchronous operation still running\n");
-      abort ();
-    }
+  if (acc_async_test (0) == 0)
+    fprintf (stderr, "expected asynchronous operation to be running\n");
 
-  acc_unmap_data (a);
 
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c	(working copy)
@@ -1,6 +1,7 @@ 
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,47 +11,17 @@ 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 3;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t diff;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +36,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -96,9 +53,29 @@  main (int argc, char **argv)
 	  abort ();
     }
 
+  gettimeofday (&tv1, NULL);
+
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[0], NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
+
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxLaunch failed: %d\n", r);
+      abort ();
+    }
+
+  gettimeofday (&tv2, NULL);
+
+  diff = tv2.tv_sec - tv1.tv_sec;
+
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,7 +89,7 @@  main (int argc, char **argv)
 	}
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep ((diff + 1) * N);
 
   for (i = 0; i < N; i++)
     {
@@ -123,11 +100,7 @@  main (int argc, char **argv)
 	}
     }
 
-  acc_unmap_data (a);
 
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c	(working copy)
@@ -9,45 +9,13 @@ 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -62,20 +30,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
 	{
@@ -85,7 +39,7 @@  main (int argc, char **argv)
 
   acc_set_cuda_stream (0, stream);
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -99,7 +53,7 @@  main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep (1);
 
   if (acc_async_test (1) != 1)
     {
@@ -107,11 +61,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   return 0;
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c	(working copy)
@@ -10,45 +10,13 @@ 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -63,20 +31,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
     {
@@ -87,7 +41,7 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
     
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -100,7 +54,12 @@  main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize () failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -108,11 +67,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c	(working copy)
@@ -1,6 +1,7 @@ 
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -10,47 +11,15 @@ 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 6;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +34,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -98,13 +53,12 @@  main (int argc, char **argv)
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
     }
 
   if (acc_async_test_all () != 0)
@@ -113,7 +67,12 @@  main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -121,11 +80,6 @@  main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c	(working copy)
@@ -5,78 +5,54 @@ 
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -91,11 +67,9 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -104,33 +78,30 @@  main (int argc, char **argv)
 
   acc_wait (0);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (0);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c	(working copy)
@@ -6,79 +6,56 @@ 
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -93,16 +70,11 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
-
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,27 +84,18 @@  main (int argc, char **argv)
       acc_wait (0);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c	(working copy)
@@ -6,79 +6,56 @@ 
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   for (i = 0; i < N; i++)
@@ -98,16 +75,11 @@  main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
-
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -117,27 +89,19 @@  main (int argc, char **argv)
       acc_wait (i);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c	(working copy)
@@ -6,78 +6,54 @@ 
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
 	{
@@ -87,11 +63,9 @@  main (int argc, char **argv)
 
   acc_set_cuda_stream (0, stream);
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -101,33 +75,30 @@  main (int argc, char **argv)
   fprintf (stderr, "CheCKpOInT\n");
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > t1)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   return 0;
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c	(working copy)
@@ -6,78 +6,54 @@ 
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -92,11 +68,9 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -105,33 +79,30 @@  main (int argc, char **argv)
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > (t1 + (t1 * 0.10)))
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c	(working copy)
@@ -6,81 +6,56 @@ 
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
-  devnum = 2;
-
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
     {
@@ -105,16 +80,11 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
-
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -132,7 +102,7 @@  main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
   if (acc_async_test (0) != 1)
     abort ();
@@ -140,25 +110,16 @@  main (int argc, char **argv)
   if (acc_async_test (1) != 1)
     abort ();
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c	(working copy)
@@ -6,98 +6,70 @@ 
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  int N;
+  const int N = 2;
   int i;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 200.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-	  abort ();
-	}
+    {
+      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
+      abort ();
+    }
 
   acc_set_cuda_stream (1, stream);
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
-
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -110,21 +82,18 @@  main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  t1 *= N;
+
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   return 0;
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c	(working copy)
@@ -6,79 +6,56 @@ 
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams, stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+	abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+	abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv2, NULL);
 
-  dtime = 500.0;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   for (i = 0; i < N; i++)
@@ -98,11 +75,6 @@  main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (N);
   if (stream != NULL)
     abort ();
@@ -117,11 +89,11 @@  main (int argc, char **argv)
   if (!acc_set_cuda_stream (N, stream))
     abort ();
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -129,6 +101,10 @@  main (int argc, char **argv)
 	}
     }
 
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
   acc_wait_all_async (N);
 
   for (i = 0; i <= N; i++)
@@ -145,15 +121,13 @@  main (int argc, char **argv)
 	abort ();
     }
 
-  atime = stop_timer (0);
-
-  if (atime < dtime)
+  if ((t1 * N) < t2)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   stream = (CUstream) acc_get_cuda_stream (N + 1);
   if (stream != NULL)
@@ -173,35 +147,33 @@  main (int argc, char **argv)
 
   acc_wait (N + 1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all_async (N);
 
   acc_wait (N);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 3\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c	(working copy)
@@ -10,46 +10,18 @@ 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay2;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 32;
   int i;
   CUstream *streams;
-  unsigned long **a, **d_a, *tid, ticks;
+  unsigned long **a, **d_a, *tid;
   int nbytes;
-  void *kargs[3];
-  int clkrate;
-  int devnum, nprocs;
+  void *kargs[2];
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,10 +38,6 @@  main (int argc, char **argv)
 
   nbytes = sizeof (int);
 
-  ticks = (unsigned long) (200.0 * clkrate);
-
-  N = nprocs;
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
@@ -103,8 +71,7 @@  main (int argc, char **argv)
   for (i = 0; i < N; i++)
     {
       kargs[0] = (void *) &d_a[i];
-      kargs[1] = (void *) &ticks;
-      kargs[2] = (void *) &tid[i];
+      kargs[1] = (void *) &tid[i];
 
       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
       if (r != CUDA_SUCCESS)
@@ -112,8 +79,6 @@  main (int argc, char **argv)
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
-      ticks = (unsigned long) (50.0 * clkrate);
     }
 
   acc_wait_all_async (0);
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c	(working copy)
@@ -0,0 +1,19 @@ 
+/* { dg-do run { target { ! openacc_nvidia_accel_configured } } } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+int
+main (void)
+{
+  fprintf (stderr, "CheCKpOInT\n");
+  acc_init (acc_device_nvidia);
+
+  acc_shutdown (acc_device_nvidia);
+
+  return 0;
+}
+
+/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
+/* { dg-output "device type nvidia not supported" } */
+/* { dg-shouldfail "" } */
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h	(working copy)
@@ -1,46 +1,24 @@ 
 
-#if ACC_DEVICE_TYPE_nvidia
-
 #pragma acc routine nohost
-static int clock (void)
-{
-  int thetime;
-
-  asm __volatile__ ("mov.u32 %0, %%clock;" : "=r"(thetime));
-
-  return thetime;
-}
-
-#endif
-
 void
-delay (unsigned long *d_o, unsigned long delay)
+delay ()
 {
-  int start, ticks;
+  int i, sum;
+  const int N = 500000;
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
-
-  return;
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 }
 
+#pragma acc routine nohost
 void
-delay2 (unsigned long *d_o, unsigned long delay, unsigned long tid)
+delay2 (unsigned long *d_o, unsigned long tid)
 {
-  int start, ticks;
+  int i, sum;
+  const int N = 500000;
 
-  start = clock ();
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
-
   d_o[0] = tid;
-
-  return;
 }
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx	(working copy)
@@ -1,148 +1,90 @@ 
-// BEGIN PREAMBLE
-	.version	3.1
-	.target	sm_30
+	.version 3.1
+	.target sm_30
 	.address_size 64
-// END PREAMBLE
 
-// BEGIN FUNCTION DEF: clock
-.func (.param.u32 %out_retval)clock
-{
-.reg.u32 %retval;
-	.reg.u64 %hr10;
-	.reg.u32 %r22;
-	.reg.u32 %r23;
-	.reg.u32 %r24;
-	.local.align 8 .b8 %frame[8];
-	// #APP 
-// 7 "subr.c" 1
-	mov.u32 %r24, %clock;
-// 0 "" 2
-	// #NO_APP 
-		st.local.u32	[%frame], %r24;
-		ld.local.u32	%r22, [%frame];
-		mov.u32	%r23, %r22;
-		mov.u32	%retval, %r23;
-	st.param.u32	[%out_retval], %retval;
-	ret;
-	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay
-.visible .entry delay(.param.u64 %in_ar1, .param.u64 %in_ar2)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u32 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u64 %r31;
-	.reg.pred %r32;
-	.local.align 8 .b8 %frame[24];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-		mov.u64	%r24, %ar1;
-		st.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
+	.visible .entry delay
 	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r26, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r26;
-		mov.u32	%r27, 0;
-		st.local.u32	[%frame], %r27;
-		bra	$L4;
-$L5:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r28, [%retval_in];
-}
-		mov.u32	%r23, %r28;
-		ld.local.u32	%r30, [%frame+4];
-		sub.u32	%r29, %r23, %r30;
-		st.local.u32	[%frame], %r29;
-$L4:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r31, [%frame+16];
-		setp.lo.u64 %r32,%r22,%r31;
-	@%r32	bra	$L5;
+	.reg .u64 %hr10;
+	.reg .u32 %r22;
+	.reg .u32 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .pred %r30;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[16];
+	cvta.local.u64 %frame,%farray;
+	mov.u32 %r22,500000;
+	st.u32 [%frame+8],%r22;
+	mov.u32 %r23,0;
+	st.u32 [%frame],%r23;
+	bra $L2;
+	$L3:
+	ld.u32 %r25,[%frame+4];
+	add.u32 %r24,%r25,1;
+	st.u32 [%frame+4],%r24;
+	ld.u32 %r27,[%frame];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame],%r26;
+	$L2:
+	ld.u32 %r28,[%frame];
+	ld.u32 %r29,[%frame+8];
+	setp.lt.s32 %r30,%r28,%r29;
+	@%r30 
+	bra $L3;
 	ret;
 	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay2
-.visible .entry delay2(.param.u64 %in_ar1, .param.u64 %in_ar2, .param.u64 %in_ar3)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %ar3;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u64 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u32 %r31;
-	.reg.u64 %r32;
-	.reg.pred %r33;
-	.reg.u64 %r34;
-	.reg.u64 %r35;
-	.local.align 8 .b8 %frame[32];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-	ld.param.u64 %ar3, [%in_ar3];
-		mov.u64	%r24, %ar1;
-		st.local.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
-		mov.u64	%r26, %ar3;
-		st.local.u64	[%frame+24], %r26;
+
+	.visible .entry delay2 (.param .u64 %in_ar1, .param .u64 %in_ar2)
 	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r27, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r27;
-		mov.u32	%r28, 0;
-		st.local.u32	[%frame], %r28;
-		bra	$L8;
-$L9:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r29, [%retval_in];
-}
-		mov.u32	%r23, %r29;
-		ld.local.u32	%r31, [%frame+4];
-		sub.u32	%r30, %r23, %r31;
-		st.local.u32	[%frame], %r30;
-$L8:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r32, [%frame+16];
-		setp.lo.u64 %r33,%r22,%r32;
-	@%r33	bra	$L9;
-		ld.local.u64	%r34, [%frame+8];
-		ld.local.u64	%r35, [%frame+24];
-		st.u64	[%r34], %r35;
+	.reg .u64 %ar1;
+	.reg .u64 %ar2;
+	.reg .u64 %hr10;
+	.reg .u64 %r22;
+	.reg .u64 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .u32 %r30;
+	.reg .u32 %r31;
+	.reg .pred %r32;
+	.reg .u64 %r33;
+	.reg .u64 %r34;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[32];
+	cvta.local.u64 %frame,%farray;
+	ld.param.u64 %ar1,[%in_ar1];
+	ld.param.u64 %ar2,[%in_ar2];
+	mov.u64 %r22,%ar1;
+	st.u64 [%frame+16],%r22;
+	mov.u64 %r23,%ar2;
+	st.u64 [%frame+24],%r23;
+	mov.u32 %r24,500000;
+	st.u32 [%frame+8],%r24;
+	mov.u32 %r25,0;
+	st.u32 [%frame],%r25;
+	bra $L5;
+	$L6:
+	ld.u32 %r27,[%frame+4];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame+4],%r26;
+	ld.u32 %r29,[%frame];
+	add.u32 %r28,%r29,1;
+	st.u32 [%frame],%r28;
+	$L5:
+	ld.u32 %r30,[%frame];
+	ld.u32 %r31,[%frame+8];
+	setp.lt.s32 %r32,%r30,%r31;
+	@%r32 
+	bra $L6;
+	ld.u64 %r33,[%frame+16];
+	ld.u64 %r34,[%frame+24];
+	st.u64 [%r33],%r34;
 	ret;
 	}
-// END FUNCTION DEF
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h	(nonexistent)
@@ -1,103 +0,0 @@ 
-
-#include <stdio.h>
-#include <cuda.h>
-
-static int _Tnum_timers;
-static CUevent *_Tstart_events, *_Tstop_events;
-static CUstream _Tstream;
-
-void
-init_timers (int ntimers)
-{
-  int i;
-  CUresult r;
-
-  _Tnum_timers = ntimers;
-
-  _Tstart_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-  _Tstop_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-
-  r = cuStreamCreate (&_Tstream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-      abort ();
-    }
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      r = cuEventCreate (&_Tstart_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-
-      r = cuEventCreate (&_Tstop_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-    }
-}
-
-void
-fini_timers (void)
-{
-  int i;
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      cuEventDestroy (_Tstart_events[i]);
-      cuEventDestroy (_Tstop_events[i]);
-    }
-
-  cuStreamDestroy (_Tstream);
-
-  free (_Tstart_events);
-  free (_Tstop_events);
-}
-
-void
-start_timer (int timer)
-{
-  CUresult r;
-
-  r = cuEventRecord (_Tstart_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-}
-
-float
-stop_timer (int timer)
-{
-  CUresult r;
-  float etime;
-
-  r = cuEventRecord (_Tstop_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventSynchronize (_Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventSynchronize failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventElapsedTime (&etime, _Tstart_events[timer], _Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventElapsedTime failed: %d\n", r);
-      abort ();
-    }
-
-  return etime;
-}
Index: libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/data-3.f90	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-fortran/data-3.f90	(working copy)
@@ -17,7 +17,7 @@  program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
 
-  !$acc parallel async wait
+  !$acc parallel async wait present (a(1:N)) present (b(1:N)) present (N)
   do i = 1, N
      b(i) = a(i)
   end do
@@ -36,7 +36,7 @@  program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) async (1)
 
-  !$acc parallel async (1) wait (1)
+  !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = a(i)
   end do
@@ -55,21 +55,22 @@  program asyncwait
   c(:) = 0.0
   d(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N))
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i)  + a(i)) / a(i)) - a(i)
   end do
@@ -76,7 +77,8 @@  program asyncwait
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N))
 
   do i = 1, N
      if (a(i) .ne. 3.0) STOP 5
@@ -91,27 +93,32 @@  program asyncwait
   d(:) = 0.0
   e(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N)) copyin (e(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N)) copyin (e(1:N))
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+  !$acc& present (e(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+  !$acc& present (e(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+  !$acc& present (e(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel wait (1) async (1)
+  !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+  !$acc& present (d(1:N), e(1:N), N)
   do i = 1, N
      e(i) = a(i) + b(i) + c(i) + d(i)
   end do
@@ -118,7 +125,8 @@  program asyncwait
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N)) copyout (e(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N)) copyout (e(1:N))
   !$acc exit data delete (N)
 
   do i = 1, N
Index: libgomp/testsuite/libgomp.oacc-fortran/data-4.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/data-4.f90	(revision 265394)
+++ libgomp/testsuite/libgomp.oacc-fortran/data-4.f90	(working copy)
@@ -17,7 +17,7 @@  program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
 
-  !$acc parallel async wait
+  !$acc parallel async wait present (a(1:N), b(1:N), N)
   !$acc loop
   do i = 1, N
      b(i) = a(i)
@@ -37,7 +37,7 @@  program asyncwait
 
   !$acc update device (a(1:N), b(1:N)) async (1)
 
-  !$acc parallel async (1) wait (1)
+  !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
   !$acc loop
   do i = 1, N
      b(i) = a(i)
@@ -60,19 +60,19 @@  program asyncwait
   !$acc enter data copyin (c(1:N), d(1:N)) async (1)
   !$acc update device (a(1:N), b(1:N)) async (1)
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i)  + a(i)) / a(i)) - a(i)
   end do
@@ -98,25 +98,26 @@  program asyncwait
   !$acc enter data copyin (e(1:N)) async (1)
   !$acc update device (a(1:N), b(1:N), c(1:N), d(1:N)) async (1)
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel wait (1) async (1)
+  !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+  !$acc& present (d(1:N), e(1:N), N)
   do i = 1, N
      e(i) = a(i) + b(i) + c(i) + d(i)
   end do