[OpenACC,3/3] Non-contiguous array support for OpenACC data clauses (re-submission), libgomp patches

Message ID 5c0db7bd-093d-d406-eb73-b26bc7685a4d@mentor.com
State New
Headers show
Series
  • [OpenACC,1/3] Non-contiguous array support for OpenACC data clauses (re-submission), front-end patches
Related show

Commit Message

Chung-Lin Tang Aug. 20, 2019, 11:36 a.m.
These are the libgomp patches (including testcases). Not much has
changed from last submission besides renaming to 'non-contiguous', etc. and
rebasing.

Thanks,
Chung-Lin


	libgomp/
	* target.c (struct gomp_ncarray_dim): New struct declaration.
	(struct gomp_ncarray_descr_type): Likewise.
	(struct ncarray_info): Likewise.
	(gomp_noncontig_array_count_rows): New function.
	(gomp_noncontig_array_compute_info): Likewise.
	(gomp_noncontig_array_fill_rows_1): Likewise.
	(gomp_noncontig_array_fill_rows): Likewise.
	(gomp_noncontig_array_create_ptrblock): Likewise.
	(gomp_map_vars): Add code to handle non-contiguous array map kinds.

	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h: New test.

Patch

Index: libgomp/target.c
===================================================================
--- libgomp/target.c	(revision 274618)
+++ libgomp/target.c	(working copy)
@@ -510,6 +510,151 @@  gomp_map_val (struct target_mem_desc *tgt, void **
   return tgt->tgt_start + tgt->list[i].offset;
 }
 
+/* Definitions for data structures describing non-contiguous arrays
+   (Note: interfaces with compiler)
+
+   The compiler generates a descriptor for each such array, places the
+   descriptor on stack, and passes the address of the descriptor to the libgomp
+   runtime as a normal map argument. The runtime then processes the array
+   data structure setup, and replaces the argument with the new actual
+   array address for the child function.
+
+   Care must be taken such that the struct field and layout assumptions
+   of struct gomp_ncarray_dim, gomp_ncarray_descr_type inside the compiler
+   be consistant with the below declarations.  */
+
+struct gomp_ncarray_dim {
+  size_t base;
+  size_t length;
+  size_t elem_size;
+  size_t is_array;
+};
+
+struct gomp_ncarray_descr_type {
+  void *ptr;
+  size_t ndims;
+  struct gomp_ncarray_dim dims[];
+};
+
+/* Internal non-contiguous array info struct, used only here inside the runtime. */
+
+struct ncarray_info
+{
+  struct gomp_ncarray_descr_type *descr;
+  size_t map_index;
+  size_t ptrblock_size;
+  size_t data_row_num;
+  size_t data_row_size;
+};
+
+static size_t
+gomp_noncontig_array_count_rows (struct gomp_ncarray_descr_type *descr)
+{
+  size_t nrows = 1;
+  for (size_t d = 0; d < descr->ndims - 1; d++)
+    nrows *= descr->dims[d].length / sizeof (void *);
+  return nrows;
+}
+
+static void
+gomp_noncontig_array_compute_info (struct ncarray_info *nca)
+{
+  size_t d, n = 1;
+  struct gomp_ncarray_descr_type *descr = nca->descr;
+
+  nca->ptrblock_size = 0;
+  for (d = 0; d < descr->ndims - 1; d++)
+    {
+      size_t dim_count = descr->dims[d].length / descr->dims[d].elem_size;
+      size_t dim_ptrblock_size = (descr->dims[d + 1].is_array
+				  ? 0 : descr->dims[d].length * n);
+      nca->ptrblock_size += dim_ptrblock_size;
+      n *= dim_count;
+    }
+  nca->data_row_num = n;
+  nca->data_row_size = descr->dims[d].length;
+}
+
+static void
+gomp_noncontig_array_fill_rows_1 (struct gomp_ncarray_descr_type *descr, void *nca,
+				  size_t d, void ***row_ptr, size_t *count)
+{
+  if (d < descr->ndims - 1)
+    {
+      size_t elsize = descr->dims[d].elem_size;
+      size_t n = descr->dims[d].length / elsize;
+      void *p = nca + descr->dims[d].base;
+      for (size_t i = 0; i < n; i++)
+	{
+	  void *ptr = p + i * elsize;
+	  /* Deref if next dimension is not array.  */
+	  if (!descr->dims[d + 1].is_array)
+	    ptr = *((void **) ptr);
+	  gomp_noncontig_array_fill_rows_1 (descr, ptr, d + 1, row_ptr, count);
+	}
+    }
+  else
+    {
+      **row_ptr = nca + descr->dims[d].base;
+      *row_ptr += 1;
+      *count += 1;
+    }
+}
+
+static size_t
+gomp_noncontig_array_fill_rows (struct gomp_ncarray_descr_type *descr, void *rows[])
+{
+  size_t count = 0;
+  void **p = rows;
+  gomp_noncontig_array_fill_rows_1 (descr, descr->ptr, 0, &p, &count);
+  return count;
+}
+
+static void *
+gomp_noncontig_array_create_ptrblock (struct ncarray_info *nca,
+				      void *tgt_addr, void *tgt_data_rows[])
+{
+  struct gomp_ncarray_descr_type *descr = nca->descr;
+  void *ptrblock = gomp_malloc (nca->ptrblock_size);
+  void **curr_dim_ptrblock = (void **) ptrblock;
+  size_t n = 1;
+
+  for (size_t d = 0; d < descr->ndims - 1; d++)
+    {
+      int curr_dim_len = descr->dims[d].length;
+      int next_dim_len = descr->dims[d + 1].length;
+      int curr_dim_num = curr_dim_len / sizeof (void *);
+
+      void *next_dim_ptrblock
+	= (void *)(curr_dim_ptrblock + n * curr_dim_num);
+
+      for (int b = 0; b < n; b++)
+        for (int i = 0; i < curr_dim_num; i++)
+	  {
+	    if (d < descr->ndims - 2)
+	      {
+		void *ptr = (next_dim_ptrblock
+			     + b * curr_dim_num * next_dim_len
+			     + i * next_dim_len);
+		void *tgt_ptr = tgt_addr + (ptr - ptrblock);
+		curr_dim_ptrblock[b * curr_dim_num + i] = tgt_ptr;
+	      }
+	    else
+	      {
+		curr_dim_ptrblock[b * curr_dim_num + i]
+		  = tgt_data_rows[b * curr_dim_num + i];
+	      }
+	    void *addr = &curr_dim_ptrblock[b * curr_dim_num + i];
+	    assert (ptrblock <= addr && addr < ptrblock + nca->ptrblock_size);
+	  }
+
+      n *= curr_dim_num;
+      curr_dim_ptrblock = next_dim_ptrblock;
+    }
+  assert (n == nca->data_row_num);
+  return ptrblock;
+}
+
 static inline __attribute__((always_inline)) struct target_mem_desc *
 gomp_map_vars_internal (struct gomp_device_descr *devicep,
 			struct goacc_asyncqueue *aq, size_t mapnum,
@@ -523,9 +668,37 @@  gomp_map_vars_internal (struct gomp_device_descr *
   const int typemask = short_mapkind ? 0xff : 0x7;
   struct splay_tree_s *mem_map = &devicep->mem_map;
   struct splay_tree_key_s cur_node;
-  struct target_mem_desc *tgt
-    = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
-  tgt->list_count = mapnum;
+  struct target_mem_desc *tgt;
+
+  bool process_noncontig_arrays = false;
+  size_t nca_data_row_num = 0, row_start = 0;
+  size_t nca_info_num = 0, nca_index;
+  struct ncarray_info *nca_info = NULL;
+  struct target_var_desc *row_desc;
+  uintptr_t target_row_addr;
+  void **host_data_rows = NULL, **target_data_rows = NULL;
+  void *row;
+
+  if (mapnum > 0)
+    {
+      int kind = get_kind (short_mapkind, kinds, 0);
+      process_noncontig_arrays = GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask);
+    }
+
+  if (process_noncontig_arrays)
+    for (i = 0; i < mapnum; i++)
+      {
+	int kind = get_kind (short_mapkind, kinds, i);
+	if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+	  {
+	    nca_data_row_num += gomp_noncontig_array_count_rows (hostaddrs[i]);
+	    nca_info_num += 1;
+	  }
+      }
+
+  tgt = gomp_malloc (sizeof (*tgt)
+		     + sizeof (tgt->list[0]) * (mapnum + nca_data_row_num));
+  tgt->list_count = mapnum + nca_data_row_num;
   tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
   struct gomp_coalesce_buf cbuf, *cbufp = NULL;
@@ -537,6 +710,14 @@  gomp_map_vars_internal (struct gomp_device_descr *
       return tgt;
     }
 
+  if (nca_info_num)
+    nca_info = gomp_alloca (sizeof (struct ncarray_info) * nca_info_num);
+  if (nca_data_row_num)
+    {
+      host_data_rows = gomp_malloc (sizeof (void *) * nca_data_row_num);
+      target_data_rows = gomp_malloc (sizeof (void *) * nca_data_row_num);
+    }
+
   tgt_align = sizeof (void *);
   tgt_size = 0;
   cbuf.chunks = NULL;
@@ -568,7 +749,7 @@  gomp_map_vars_internal (struct gomp_device_descr *
       return NULL;
     }
 
-  for (i = 0; i < mapnum; i++)
+  for (i = 0, nca_index = 0; i < mapnum; i++)
     {
       int kind = get_kind (short_mapkind, kinds, i);
       if (hostaddrs[i] == NULL
@@ -633,6 +814,20 @@  gomp_map_vars_internal (struct gomp_device_descr *
 	  has_firstprivate = true;
 	  continue;
 	}
+      else if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+	{
+	  /* Ignore non-contiguous arrays for now, we process them together
+	     later.  */
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = 0;
+	  not_found_cnt++;
+
+	  struct ncarray_info *nca = &nca_info[nca_index++];
+	  nca->descr = (struct gomp_ncarray_descr_type *) hostaddrs[i];
+	  nca->map_index = i;
+	  continue;
+	}
+
       cur_node.host_start = (uintptr_t) hostaddrs[i];
       if (!GOMP_MAP_POINTER_P (kind & typemask))
 	cur_node.host_end = cur_node.host_start + sizes[i];
@@ -701,6 +896,56 @@  gomp_map_vars_internal (struct gomp_device_descr *
 	}
     }
 
+  /* For non-contiguous arrays. Each data row is one target item, separated
+     from the normal map clause items, hence we order them after mapnum.  */
+  if (process_noncontig_arrays)
+    for (i = 0, nca_index = 0, row_start = 0; i < mapnum; i++)
+      {
+	int kind = get_kind (short_mapkind, kinds, i);
+	if (!GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+	  continue;
+
+	struct ncarray_info *nca = &nca_info[nca_index++];
+	struct gomp_ncarray_descr_type *descr = nca->descr;
+	size_t nr;
+
+	gomp_noncontig_array_compute_info (nca);
+
+	/* We have allocated space in host/target_data_rows to place all the
+	   row data block pointers, now we can start filling them in.  */
+	nr = gomp_noncontig_array_fill_rows (descr, &host_data_rows[row_start]);
+	assert (nr == nca->data_row_num);
+
+	size_t align = (size_t) 1 << (kind >> rshift);
+	if (tgt_align < align)
+	  tgt_align = align;
+	tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	tgt_size += nca->ptrblock_size;
+
+	for (size_t j = 0; j < nca->data_row_num; j++)
+	  {
+	    row = host_data_rows[row_start + j];
+	    row_desc = &tgt->list[mapnum + row_start + j];
+
+	    cur_node.host_start = (uintptr_t) row;
+	    cur_node.host_end = cur_node.host_start + nca->data_row_size;
+	    splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+	    if (n)
+	      {
+		assert (n->refcount != REFCOUNT_LINK);
+		gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
+					kind & typemask, /* TODO: cbuf? */ NULL);
+	      }
+	    else
+	      {
+		tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		tgt_size += nca->data_row_size;
+		not_found_cnt++;
+	      }
+	  }
+	row_start += nca->data_row_num;
+      }
+
   if (devaddrs)
     {
       if (mapnum != 1)
@@ -861,6 +1106,15 @@  gomp_map_vars_internal (struct gomp_device_descr *
 	      default:
 		break;
 	      }
+
+	    if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+	      {
+		tgt->list[i].key = &array->key;
+		tgt->list[i].key->tgt = tgt;
+		array++;
+		continue;
+	      }
+
 	    splay_tree_key k = &array->key;
 	    k->host_start = (uintptr_t) hostaddrs[i];
 	    if (!GOMP_MAP_POINTER_P (kind & typemask))
@@ -1010,8 +1264,115 @@  gomp_map_vars_internal (struct gomp_device_descr *
 		array++;
 	      }
 	  }
+
+      /* Processing of non-contiguous array rows.  */
+      if (process_noncontig_arrays)
+	{
+	  for (i = 0, nca_index = 0, row_start = 0; i < mapnum; i++)
+	    {
+	      int kind = get_kind (short_mapkind, kinds, i);
+	      if (!GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+		continue;
+
+	      struct ncarray_info *nca = &nca_info[nca_index++];
+	      assert (nca->descr == hostaddrs[i]);
+
+	      /* The map for the non-contiguous array itself is never copied from
+		 during unmapping, its the data rows that count. Set copy-from
+		 flags to false here.  */
+	      tgt->list[i].copy_from = false;
+	      tgt->list[i].always_copy_from = false;
+
+	      size_t align = (size_t) 1 << (kind >> rshift);
+	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+
+	      /* For the map of the non-contiguous array itself, adjust so that
+		 the passed device address points to the beginning of the
+		 ptrblock.  */
+	      tgt->list[i].key->tgt_offset = tgt_size;
+
+	      void *target_ptrblock = (void*) tgt->tgt_start + tgt_size;
+	      tgt_size += nca->ptrblock_size;
+
+	      /* Add splay key for each data row in current non-contiguous
+		 array.  */
+	      for (size_t j = 0; j < nca->data_row_num; j++)
+		{
+		  row = host_data_rows[row_start + j];
+		  row_desc = &tgt->list[mapnum + row_start + j];
+
+		  cur_node.host_start = (uintptr_t) row;
+		  cur_node.host_end = cur_node.host_start + nca->data_row_size;
+		  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+		  if (n)
+		    {
+		      assert (n->refcount != REFCOUNT_LINK);
+		      gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
+					      kind & typemask, cbufp);
+		      target_row_addr = n->tgt->tgt_start + n->tgt_offset;
+		    }
+		  else
+		    {
+		      tgt->refcount++;
+
+		      splay_tree_key k = &array->key;
+		      k->host_start = (uintptr_t) row;
+		      k->host_end = k->host_start + nca->data_row_size;
+
+		      k->tgt = tgt;
+		      k->refcount = 1;
+		      k->link_key = NULL;
+		      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		      target_row_addr = tgt->tgt_start + tgt_size;
+		      k->tgt_offset = tgt_size;
+		      tgt_size += nca->data_row_size;
+
+		      row_desc->key = k;
+		      row_desc->copy_from
+			= GOMP_MAP_COPY_FROM_P (kind & typemask);
+		      row_desc->always_copy_from
+			= GOMP_MAP_COPY_FROM_P (kind & typemask);
+		      row_desc->offset = 0;
+		      row_desc->length = nca->data_row_size;
+
+		      array->left = NULL;
+		      array->right = NULL;
+		      splay_tree_insert (mem_map, array);
+
+		      if (GOMP_MAP_COPY_TO_P (kind & typemask))
+			gomp_copy_host2dev (devicep, aq,
+					    (void *) tgt->tgt_start + k->tgt_offset,
+					    (void *) k->host_start,
+					    nca->data_row_size, cbufp);
+		      array++;
+		    }
+		  target_data_rows[row_start + j] = (void *) target_row_addr;
+		}
+
+	      /* Now we have the target memory allocated, and target offsets of all
+		 row blocks assigned and calculated, we can construct the
+		 accelerator side ptrblock and copy it in.  */
+	      if (nca->ptrblock_size)
+		{
+		  void *ptrblock = gomp_noncontig_array_create_ptrblock
+		    (nca, target_ptrblock, target_data_rows + row_start);
+		  gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
+				      nca->ptrblock_size, cbufp);
+		  free (ptrblock);
+		}
+
+	      row_start += nca->data_row_num;
+	    }
+	  assert (row_start == nca_data_row_num && nca_index == nca_info_num);
+	}
     }
 
+  if (nca_data_row_num)
+    {
+      free (host_data_rows);
+      free (target_data_rows);
+    }
+
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       for (i = 0; i < mapnum; i++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c	(working copy)
@@ -0,0 +1,103 @@ 
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#define n 100
+#define m 100
+
+int b[n][m];
+
+void
+test1 (void)
+{
+  int i, j, *a[100];
+
+  /* Array of pointers form test.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = (int *)malloc (sizeof (int) * m);
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    {
+      for (j = 0; j < m; j++)
+	assert (a[i][j] == b[i][j]);
+      /* Clean up.  */
+      free (a[i]);
+    }
+}
+
+void
+test2 (void)
+{
+  int i, j, **a = (int **) malloc (sizeof (int *) * n);
+
+  /* Separately allocated blocks.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = (int *)malloc (sizeof (int) * m);
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    {
+      for (j = 0; j < m; j++)
+	assert (a[i][j] == b[i][j]);
+      /* Clean up.  */
+      free (a[i]);
+    }
+  free (a);
+}
+
+void
+test3 (void)
+{
+  int i, j, **a = (int **) malloc (sizeof (int *) * n);
+  a[0] = (int *) malloc (sizeof (int) * n * m);
+
+  /* Rows allocated in one contiguous block.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = *a + i * m;
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    for (j = 0; j < m; j++)
+      assert (a[i][j] == b[i][j]);
+
+  free (a[0]);
+  free (a);
+}
+
+int
+main (void)
+{
+  test1 ();
+  test2 ();
+  test3 ();
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c	(working copy)
@@ -0,0 +1,37 @@ 
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <assert.h>
+#include "noncontig_array-utils.h"
+
+int
+main (void)
+{
+  int n = 10;
+  int ***a = (int ***) create_ncarray (sizeof (int), n, 3);
+  int ***b = (int ***) create_ncarray (sizeof (int), n, 3);
+  int ***c = (int ***) create_ncarray (sizeof (int), n, 3);
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	{
+	  a[i][j][k] = i + j * k + k;
+	  b[i][j][k] = j + k * i + i * j;
+	  c[i][j][k] = a[i][j][k];
+	}
+
+  #pragma acc parallel copy (a[0:n][0:n][0:n]) copyin (b[0:n][0:n][0:n])
+  {
+    for (int i = 0; i < n; i++)
+      for (int j = 0; j < n; j++)
+	for (int k = 0; k < n; k++)
+	  a[i][j][k] += b[k][j][i] + i + j + k;
+  }
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	assert (a[i][j][k] == c[i][j][k] + b[k][j][i] + i + j + k);
+
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c	(working copy)
@@ -0,0 +1,45 @@ 
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <assert.h>
+#include "noncontig_array-utils.h"
+
+int main (void)
+{
+  int n = 20, x = 5, y = 12;
+  int *****a = (int *****) create_ncarray (sizeof (int), n, 5);
+
+  int sum1 = 0, sum2 = 0, sum3 = 0;
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	for (int l = 0; l < n; l++)
+	  for (int m = 0; m < n; m++)
+	    {
+	      a[i][j][k][l][m] = 1;
+	      sum1++;
+	    }
+
+  #pragma acc parallel copy (a[x:y][x:y][x:y][x:y][x:y]) copy(sum2)
+  {
+    for (int i = x; i < x + y; i++)
+      for (int j = x; j < x + y; j++)
+	for (int k = x; k < x + y; k++)
+	  for (int l = x; l < x + y; l++)
+	    for (int m = x; m < x + y; m++)
+	      {
+		a[i][j][k][l][m] = 0;
+		sum2++;
+	      }
+  }
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	for (int l = 0; l < n; l++)
+	  for (int m = 0; m < n; m++)
+	    sum3 += a[i][j][k][l][m];
+
+  assert (sum1 == sum2 + sum3);
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c	(working copy)
@@ -0,0 +1,36 @@ 
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <assert.h>
+#include "noncontig_array-utils.h"
+
+int main (void)
+{
+  int n = 128;
+  double ***a = (double ***) create_ncarray (sizeof (double), n, 3);
+  double ***b = (double ***) create_ncarray (sizeof (double), n, 3);
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	a[i][j][k] = i + j + k + i * j * k;
+
+  /* This test exercises async copyout of non-contiguous array rows.  */
+  #pragma acc parallel copyin(a[0:n][0:n][0:n]) copyout(b[0:n][0:n][0:n]) async(5)
+  {
+    #pragma acc loop gang
+    for (int i = 0; i < n; i++)
+      #pragma acc loop vector
+      for (int j = 0; j < n; j++)
+	for (int k = 0; k < n; k++)
+	  b[i][j][k] = a[i][j][k] * 2.0;
+  }
+
+  #pragma acc wait (5)
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	assert (b[i][j][k] == a[i][j][k] * 2.0);
+
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h	(working copy)
@@ -0,0 +1,44 @@ 
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+
+/* Allocate and create a pointer based NDIMS-dimensional array,
+   each dimension DIMLEN long, with ELSIZE sized data elements.  */
+void *
+create_ncarray (size_t elsize, int dimlen, int ndims)
+{
+  size_t blk_size = 0;
+  size_t n = 1;
+
+  for (int i = 0; i < ndims - 1; i++)
+    {
+      n *= dimlen;
+      blk_size += sizeof (void *) * n;
+    }
+  size_t data_rows_num = n;
+  size_t data_rows_offset = blk_size;
+  blk_size += elsize * n * dimlen;
+
+  void *blk = (void *) malloc (blk_size);
+  memset (blk, 0, blk_size);
+  void **curr_dim = (void **) blk;
+  n = 1;
+
+  for (int d = 0; d < ndims - 1; d++)
+    {
+      uintptr_t next_dim = (uintptr_t) (curr_dim + n * dimlen);
+      size_t next_dimlen = dimlen * (d < ndims - 2 ? sizeof (void *) : elsize);
+
+      for (int b = 0; b < n; b++)
+        for (int i = 0; i < dimlen; i++)
+	  if (d < ndims - 1)
+	    curr_dim[b * dimlen + i]
+	      = (void*) (next_dim + b * dimlen * next_dimlen + i * next_dimlen);
+
+      n *= dimlen;
+      curr_dim = (void**) next_dim;
+    }
+  assert (n == data_rows_num);
+  return blk;
+}