[og7] vector_length extension part 1: generalize function and variable names

Message ID d6642c62-6d01-10ce-dda2-f5fa453ed971@codesourcery.com
State New
Headers show
Series
  • [og7] vector_length extension part 1: generalize function and variable names
Related show

Commit Message

Cesar Philippidis March 1, 2018, 9:17 p.m.
Right now, I'm in the process of adding support for larger
vector_lengths in the nvptx BE. To reduce the size of the final patch,
I've separated all of the misc. function and variable renaming into this
patch. Once the nvptx BE is extended to support multiple CUDA
warps-sized vector lengths, in certain respect vectors will act like
workers with regards to state propagation and reductions (i.e., large
vectors will use shared-memory to propagate state between vector-single
to vector-partitioned modes and also for reductions). To that end, this
patch renames worker functions and variables from worker-something to
shared-something. Likewise, vector specific functions have been renamed
as warp-something.

This patch also introduces a new populate_offload_attrs function. At
present, that function is only used in nvptx_reorg, and it is actually
overkill for that. However, later one it will be used in other places,
including nvptx_validate_dims and the nvptx reduction handling code.

This patch has been committed to openacc-gcc-7-branch.

Cesar

Comments

Thomas Schwinge March 9, 2018, 3:29 p.m. | #1
Hi!

On Thu, 1 Mar 2018 13:17:01 -0800, Cesar Philippidis <cesar@codesourcery.com> wrote:
> To reduce the size of the final patch,

> I've separated all of the misc. function and variable renaming into this

> patch.


Yes, please always do such refactoring changes independently of other
functionality changes.


> This patch also introduces a new populate_offload_attrs function.


I'm seeing:

    [...]/gcc/config/nvptx/nvptx.c: In function 'void nvptx_reorg()':
    [...]/gcc/config/nvptx/nvptx.c:4451:3: warning: 'oa.offload_attrs::vector_length' may be used uninitialized in this function [-Wmaybe-uninitialized]
       if (oa->vector_length == 0)
       ^
    [...]/gcc/config/nvptx/nvptx.c:4516:21: note: 'oa.offload_attrs::vector_length' was declared here
           offload_attrs oa;
                         ^

That must be "populate_offload_attrs" inlined into "nvptx_reorg".  I
can't tell yet why it complains about "vector_length" only but not about
the others.

For reference:

> --- a/gcc/config/nvptx/nvptx.c

> +++ b/gcc/config/nvptx/nvptx.c


> +/* Offloading function attributes.  */

> +

> +struct offload_attrs

> +{

> +  unsigned mask;

> +  int num_gangs;

> +  int num_workers;

> +  int vector_length;

> +  int max_workers;

> +};


> +static void

> +populate_offload_attrs (offload_attrs *oa)

> +{

> +  tree attr = oacc_get_fn_attrib (current_function_decl);

> +  tree dims = TREE_VALUE (attr);

> +  unsigned ix;

> +

> +  oa->mask = 0;

> +

> +  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))

> +    {

> +      tree t = TREE_VALUE (dims);

> +      int size = (t == NULL_TREE) ? 0 : TREE_INT_CST_LOW (t);

> +      tree allowed = TREE_PURPOSE (dims);

> +

> +      if (size != 1 && !(allowed && integer_zerop (allowed)))

> +	oa->mask |= GOMP_DIM_MASK (ix);

> +

> +      switch (ix)

> +	{

> +	case GOMP_DIM_GANG:

> +	  oa->num_gangs = size;

> +	  break;

> +

> +	case GOMP_DIM_WORKER:

> +	  oa->num_workers = size;

> +	  break;

> +

> +	case GOMP_DIM_VECTOR:

> +	  oa->vector_length = size;

> +	  break;

> +	}

> +    }

> +

> +  if (oa->vector_length == 0)

> +    {

> +      /* FIXME: Need a more graceful way to handle large vector

> +	 lengths in OpenACC routines.  */

> +      if (!lookup_attribute ("omp target entrypoint",

> +			     DECL_ATTRIBUTES (current_function_decl)))

> +	oa->vector_length = PTX_WARP_SIZE;

> +      else

> +	oa->vector_length = PTX_VECTOR_LENGTH;

> +    }

> +  if (oa->num_workers == 0)

> +    oa->max_workers = PTX_CTA_SIZE / oa->vector_length;

> +  else

> +    oa->max_workers = oa->num_workers;

> +}


> @@ -4435,27 +4513,19 @@ nvptx_reorg (void)

>      {

>        /* If we determined this mask before RTL expansion, we could

>  	 elide emission of some levels of forks and joins.  */

> -      unsigned mask = 0;

> -      tree dims = TREE_VALUE (attr);

> -      unsigned ix;

> +      offload_attrs oa;

>  

> -      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))

> -	{

> -	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

> -	  tree allowed = TREE_PURPOSE (dims);

> +      populate_offload_attrs (&oa);

>  

> -	  if (size != 1 && !(allowed && integer_zerop (allowed)))

> -	    mask |= GOMP_DIM_MASK (ix);

> -	}

>        /* If there is worker neutering, there must be vector

>  	 neutering.  Otherwise the hardware will fail.  */

> -      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))

> -		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

> +      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))

> +		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

>  

>        /* Discover & process partitioned regions.  */

>        parallel *pars = nvptx_discover_pars (&bb_insn_map);

>        nvptx_process_pars (pars);

> -      nvptx_neuter_pars (pars, mask, 0);

> +      nvptx_neuter_pars (pars, oa.mask, 0);

>        delete pars;

>      }



Grüße
 Thomas
Cesar Philippidis March 9, 2018, 3:31 p.m. | #2
On 03/09/2018 07:29 AM, Thomas Schwinge wrote:

> On Thu, 1 Mar 2018 13:17:01 -0800, Cesar Philippidis <cesar@codesourcery.com> wrote:

>> To reduce the size of the final patch,

>> I've separated all of the misc. function and variable renaming into this

>> patch.

> 

> Yes, please always do such refactoring changes independently of other

> functionality changes.

> 

> 

>> This patch also introduces a new populate_offload_attrs function.

> 

> I'm seeing:

> 

>     [...]/gcc/config/nvptx/nvptx.c: In function 'void nvptx_reorg()':

>     [...]/gcc/config/nvptx/nvptx.c:4451:3: warning: 'oa.offload_attrs::vector_length' may be used uninitialized in this function [-Wmaybe-uninitialized]

>        if (oa->vector_length == 0)

>        ^

>     [...]/gcc/config/nvptx/nvptx.c:4516:21: note: 'oa.offload_attrs::vector_length' was declared here

>            offload_attrs oa;

>                          ^

> 

> That must be "populate_offload_attrs" inlined into "nvptx_reorg".  I

> can't tell yet why it complains about "vector_length" only but not about

> the others.


I got lazy and just merged that function as-is. That warning will go
away once the reset of the vector length changes are in.

Cesar

Patch

2018-03-01  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (PTX_VECTOR_LENGTH, PTX_WORKER_LENGTH,
	PTX_DEFAULT_RUNTIME_DIM): Move to the top of the file.
	(PTX_WARP_SIZE): Define.
	(PTX_CTA_SIZE): Define.
	(worker_bcast_size): Rename to oacc_bcast_size.
	(worker_bcast_align): Rename to oacc_bcast_align.
	(worker_bcast_sym): Rename to oacc_bcast_sym.
	(nvptx_option_override): Update usage of oacc_bcast_*.
	(nvptx_gen_wcast): Rename to nvptx_gen_warp_bcast.
	(struct wcast_data_t): Rename to broadcast_data_t.
	(nvptx_gen_wcast): Rename to nvptx_gen_shared_bcast.  Update to use
	oacc_bcast_* variables.
	(struct offload_attrs): New.
	(propagator_fn): Add bool argument.
	(nvptx_propagate): New bool argument.  Pass bool argument to fn.
	(vprop_gen): Rename to warp_prop_gen.  Update call to
	nvptx_gen_warp_bcast.
	(nvptx_vpropagate): Rename to nvptx_warp_propagate. Update call to
	nvptx_propagate.
	(wprop_gen): Rename to shared_prop_gen.  Update usage of oacc_bcast_*
	variables and call to nvptx_gen_shared_bcast.
	(nvptx_wpropagate): Rename to nvptx_shared_propagate.  Update usage
	of oacc_bcast_* variables and call to nvptx_propagate.
	(nvptx_wsync): Rename to nvptx_cta_sync.
	(nvptx_single): Update usage of oacc_bcast_* vars and calls to
	nvptx_gen_warp_bcast, nvptx_gen_shared_bcast and nvptx_cta_sync.
	(nvptx_process_pars): Likewise.
	(nvptx_neuter_pars): Whitespace.
	(populate_offload_attrs): New function.
	(nvptx_reorg): Use it to extract partitioning mask.
	(write_worker_buffer): Rename to write_shared_buffer.
	(nvptx_file_end): Update calls to write_shared_buffer.
	(nvptx_expand_worker_addr): Rename to nvptx_expand_shared_addr.
	(nvptx_expand_builtin): Update call to nvptx_expand_shared_addr.
	(nvptx_simt_vf): Return PTX_WARP_SIZE instead of PTX_VECTOR_LENGTH.
	(nvptx_get_worker_red_addr): Rename to nvptx_get_shared_red_addr.
	(nvptx_goacc_reduction_setup): Update call to
	nvptx_get_shared_red_addr.
	(nvptx_goacc_reduction_fini): Likewise.
	(nvptx_goacc_reduction_teardown): Likewise.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 1d510a7bb7d..b16cf59575c 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -77,6 +77,14 @@ 
 
 #define WORKAROUND_PTXJIT_BUG 1
 
+/* Define dimension sizes for known hardware.  */
+#define PTX_VECTOR_LENGTH 32
+//#define PTX_VECTOR_LENGTH 128
+#define PTX_WORKER_LENGTH 32
+#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
+#define PTX_WARP_SIZE 32
+#define PTX_CTA_SIZE 1024
+
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
 {
@@ -118,14 +126,15 @@  struct tree_hasher : ggc_cache_ptr_hash<tree_node>
 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
 
-/* Buffer needed to broadcast across workers.  This is used for both
-   worker-neutering and worker broadcasting.  It is shared by all
-   functions emitted.  The buffer is placed in shared memory.  It'd be
-   nice if PTX supported common blocks, because then this could be
-   shared across TUs (taking the largest size).  */
-static unsigned worker_bcast_size;
-static unsigned worker_bcast_align;
-static GTY(()) rtx worker_bcast_sym;
+/* Buffer needed to broadcast across workers and vectors.  This is
+   used for both worker-neutering and worker broadcasting, and
+   vector-neutering and boardcasting when vector_length > 32.  It is
+   shared by all functions emitted.  The buffer is placed in shared
+   memory.  It'd be nice if PTX supported common blocks, because then
+   this could be shared across TUs (taking the largest size).  */
+static unsigned oacc_bcast_size;
+static unsigned oacc_bcast_align;
+static GTY(()) rtx oacc_bcast_sym;
 
 /* Buffer needed for worker reductions.  This has to be distinct from
    the worker broadcast array, as both may be live concurrently.  */
@@ -198,9 +207,9 @@  nvptx_option_override (void)
   declared_libfuncs_htab
     = hash_table<declared_libfunc_hasher>::create_ggc (17);
 
-  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
-  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
-  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
+  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
+  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 
   worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
   SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
@@ -1737,14 +1746,14 @@  nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
    across the vectors of a single warp.  */
 
 static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_warp_bcast (rtx reg)
 {
   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
 }
 
 /* Structure used when generating a worker-level spill or fill.  */
 
-struct wcast_data_t
+struct broadcast_data_t
 {
   rtx base;  /* Register holding base addr of buffer.  */
   rtx ptr;  /* Iteration var,  if needed.  */
@@ -1768,7 +1777,8 @@  enum propagate_mask
    how many loop iterations will be executed (0 for not a loop).  */
    
 static rtx
-nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
+			broadcast_data_t *data, bool vector)
 {
   rtx  res;
   machine_mode mode = GET_MODE (reg);
@@ -1782,7 +1792,7 @@  nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
 	start_sequence ();
 	if (pm & PM_read)
 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
-	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
 	if (pm & PM_write)
 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
 	res = get_insns ();
@@ -1798,10 +1808,11 @@  nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
 	  {
 	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
 
-	    if (align > worker_bcast_align)
-	      worker_bcast_align = align;
+	    if (align > oacc_bcast_align)
+	      oacc_bcast_align = align;
 	    data->offset = (data->offset + align - 1) & ~(align - 1);
 	    addr = data->base;
+	    gcc_assert (data->base != NULL);
 	    if (data->offset)
 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
 	  }
@@ -2809,8 +2820,16 @@  nvptx_propagate_unified (rtx_insn *unified)
   gcc_assert (ok);
 }
 
-/* Loop structure of the function.  The entire function is described as
-   a NULL loop.  */
+/* Offloading function attributes.  */
+
+struct offload_attrs
+{
+  unsigned mask;
+  int num_gangs;
+  int num_workers;
+  int vector_length;
+  int max_workers;
+};
 
 struct parallel
 {
@@ -3746,11 +3765,11 @@  nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
    regions and (b) only propagating stack entries that are used.  The
    latter might be quite hard to determine.  */
 
-typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
 
 static bool
 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
-		 propagate_mask rw, propagator_fn fn, void *data)
+		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
 {
   bitmap live = DF_LIVE_IN (block);
   bitmap_iterator iterator;
@@ -3785,7 +3804,7 @@  nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 	  
 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
 	  /* Allow worker function to initialize anything needed.  */
-	  rtx init = fn (tmp, PM_loop_begin, fs, data);
+	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
 	  if (init)
 	    emit_insn (init);
 	  emit_label (label);
@@ -3794,7 +3813,7 @@  nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 	}
       if (rw & PM_read)
 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
-      emit_insn (fn (tmp, rw, fs, data));
+      emit_insn (fn (tmp, rw, fs, data, vector));
       if (rw & PM_write)
 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
       if (fs)
@@ -3802,7 +3821,7 @@  nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
 	  emit_insn (gen_br_true_uni (pred, label));
-	  rtx fini = fn (tmp, PM_loop_end, fs, data);
+	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
 	  if (fini)
 	    emit_insn (fini);
 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
@@ -3822,7 +3841,7 @@  nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 
 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
 	  {
-	    rtx bcast = fn (reg, rw, 0, data);
+	    rtx bcast = fn (reg, rw, 0, data, vector);
 
 	    insn = emit_insn_after (bcast, insn);
 	    empty = false;
@@ -3831,16 +3850,17 @@  nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
   return empty;
 }
 
-/* Worker for nvptx_vpropagate.  */
+/* Worker for nvptx_warp_propagate.  */
 
 static rtx
-vprop_gen (rtx reg, propagate_mask pm,
-	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+warp_prop_gen (rtx reg, propagate_mask pm,
+	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
+	       bool ARG_UNUSED (vector))
 {
   if (!(pm & PM_read_write))
     return 0;
   
-  return nvptx_gen_vcast (reg);
+  return nvptx_gen_warp_bcast (reg);
 }
 
 /* Propagate state that is live at start of BLOCK across the vectors
@@ -3848,25 +3868,27 @@  vprop_gen (rtx reg, propagate_mask pm,
    IS_CALL and return as for nvptx_propagate.  */
 
 static bool
-nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
+nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
 {
-  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write,
+			  warp_prop_gen, 0, false);
 }
 
-/* Worker for nvptx_wpropagate.  */
+/* Worker for nvptx_shared_propagate.  */
 
 static rtx
-wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
+		 bool vector)
 {
-  wcast_data_t *data = (wcast_data_t *)data_;
+  broadcast_data_t *data = (broadcast_data_t *)data_;
 
   if (pm & PM_loop_begin)
     {
       /* Starting a loop, initialize pointer.    */
       unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
 
-      if (align > worker_bcast_align)
-	worker_bcast_align = align;
+      if (align > oacc_bcast_align)
+	oacc_bcast_align = align;
       data->offset = (data->offset + align - 1) & ~(align - 1);
 
       data->ptr = gen_reg_rtx (Pmode);
@@ -3880,7 +3902,7 @@  wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
       return clobber;
     }
   else
-    return nvptx_gen_wcast (reg, pm, rep, data);
+    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
 }
 
 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
@@ -3889,34 +3911,36 @@  wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
    INSN.  IS_CALL and return as for nvptx_propagate.  */
 
 static bool
-nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
+nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
+			rtx_insn *insn, bool vector)
 {
-  wcast_data_t data;
+  broadcast_data_t data;
 
   data.base = gen_reg_rtx (Pmode);
   data.offset = 0;
   data.ptr = NULL_RTX;
 
   bool empty = nvptx_propagate (is_call, block, insn,
-				pre_p ? PM_read : PM_write, wprop_gen, &data);
+				pre_p ? PM_read : PM_write, shared_prop_gen,
+				&data, vector);
   gcc_assert (empty == !data.offset);
   if (data.offset)
     {
       /* Stuff was emitted, initialize the base pointer now.  */
-      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
+      rtx init = gen_rtx_SET (data.base, oacc_bcast_sym);
       emit_insn_after (init, insn);
 
-      if (worker_bcast_size < data.offset)
-	worker_bcast_size = data.offset;
+      if (oacc_bcast_size < data.offset)
+	oacc_bcast_size = data.offset;
     }
   return empty;
 }
 
-/* Emit a worker-level synchronization barrier.  We use different
+/* Emit a CTA-level synchronization barrier.  We use different
    markers for before and after synchronizations.  */
 
 static rtx
-nvptx_wsync (bool after)
+nvptx_cta_sync (bool after)
 {
   return gen_nvptx_barsync (GEN_INT (after));
 }
@@ -4153,31 +4177,33 @@  nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
 #endif
-	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
+	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
 	}
       else
 	{
 	  /* Includes worker mode, do spill & fill.  By construction
 	     we should never have worker mode only. */
-	  wcast_data_t data;
+	  broadcast_data_t data;
 
-	  data.base = worker_bcast_sym;
+	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
 
-	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
-	    worker_bcast_size = GET_MODE_SIZE (SImode);
+	  if (oacc_bcast_size < GET_MODE_SIZE (SImode))
+	    oacc_bcast_size = GET_MODE_SIZE (SImode);
 
 	  data.offset = 0;
-	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
+						    false),
 			    before);
 	  /* Barrier so other workers can see the write.  */
-	  emit_insn_before (nvptx_wsync (false), tail);
+	  emit_insn_before (nvptx_cta_sync (false), tail);
 	  data.offset = 0;
-	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
+						    false), tail);
 	  /* This barrier is needed to avoid worker zero clobbering
 	     the broadcast buffer before all the other workers have
 	     had a chance to read this instance of it.  */
-	  emit_insn_before (nvptx_wsync (true), tail);
+	  emit_insn_before (nvptx_cta_sync (true), tail);
 	}
 
       extract_insn (tail);
@@ -4289,19 +4315,21 @@  nvptx_process_pars (parallel *par)
   
   if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
     {
-      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
-      bool empty = nvptx_wpropagate (true, is_call,
-				     par->forked_block, par->fork_insn);
+      nvptx_shared_propagate (false, is_call, par->forked_block,
+			      par->forked_insn, false);
+      bool empty = nvptx_shared_propagate (true, is_call,
+					   par->forked_block, par->fork_insn,
+					   false);
 
       if (!empty || !is_call)
 	{
 	  /* Insert begin and end synchronizations.  */
-	  emit_insn_after (nvptx_wsync (false), par->forked_insn);
-	  emit_insn_before (nvptx_wsync (true), par->joining_insn);
+	  emit_insn_after (nvptx_cta_sync (false), par->forked_insn);
+	  emit_insn_before (nvptx_cta_sync (true), par->joining_insn);
 	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
+    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
 
   /* Now do siblings.  */
   if (par->next)
@@ -4380,12 +4408,62 @@  nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
     }
 
   if (skip_mask)
-      nvptx_skip_par (skip_mask, par);
+    nvptx_skip_par (skip_mask, par);
   
   if (par->next)
     nvptx_neuter_pars (par->next, modes, outer);
 }
 
+static void
+populate_offload_attrs (offload_attrs *oa)
+{
+  tree attr = oacc_get_fn_attrib (current_function_decl);
+  tree dims = TREE_VALUE (attr);
+  unsigned ix;
+
+  oa->mask = 0;
+
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+    {
+      tree t = TREE_VALUE (dims);
+      int size = (t == NULL_TREE) ? 0 : TREE_INT_CST_LOW (t);
+      tree allowed = TREE_PURPOSE (dims);
+
+      if (size != 1 && !(allowed && integer_zerop (allowed)))
+	oa->mask |= GOMP_DIM_MASK (ix);
+
+      switch (ix)
+	{
+	case GOMP_DIM_GANG:
+	  oa->num_gangs = size;
+	  break;
+
+	case GOMP_DIM_WORKER:
+	  oa->num_workers = size;
+	  break;
+
+	case GOMP_DIM_VECTOR:
+	  oa->vector_length = size;
+	  break;
+	}
+    }
+
+  if (oa->vector_length == 0)
+    {
+      /* FIXME: Need a more graceful way to handle large vector
+	 lengths in OpenACC routines.  */
+      if (!lookup_attribute ("omp target entrypoint",
+			     DECL_ATTRIBUTES (current_function_decl)))
+	oa->vector_length = PTX_WARP_SIZE;
+      else
+	oa->vector_length = PTX_VECTOR_LENGTH;
+    }
+  if (oa->num_workers == 0)
+    oa->max_workers = PTX_CTA_SIZE / oa->vector_length;
+  else
+    oa->max_workers = oa->num_workers;
+}
+
 /* PTX-specific reorganization
    - Split blocks at fork and join instructions
    - Compute live registers
@@ -4435,27 +4513,19 @@  nvptx_reorg (void)
     {
       /* If we determined this mask before RTL expansion, we could
 	 elide emission of some levels of forks and joins.  */
-      unsigned mask = 0;
-      tree dims = TREE_VALUE (attr);
-      unsigned ix;
+      offload_attrs oa;
 
-      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
-	{
-	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
-	  tree allowed = TREE_PURPOSE (dims);
+      populate_offload_attrs (&oa);
 
-	  if (size != 1 && !(allowed && integer_zerop (allowed)))
-	    mask |= GOMP_DIM_MASK (ix);
-	}
       /* If there is worker neutering, there must be vector
 	 neutering.  Otherwise the hardware will fail.  */
-      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
 
       /* Discover & process partitioned regions.  */
       parallel *pars = nvptx_discover_pars (&bb_insn_map);
       nvptx_process_pars (pars);
-      nvptx_neuter_pars (pars, mask, 0);
+      nvptx_neuter_pars (pars, oa.mask, 0);
       delete pars;
     }
 
@@ -4629,10 +4699,11 @@  nvptx_file_start (void)
   fputs ("// END PREAMBLE\n", asm_out_file);
 }
 
-/* Emit a declaration for a worker-level buffer in .shared memory.  */
+/* Emit a declaration for a worker and vector-level buffer in .shared
+   memory.  */
 
 static void
-write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
+write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
 {
   const char *name = XSTR (sym, 0);
 
@@ -4653,16 +4724,16 @@  nvptx_file_end (void)
     nvptx_record_fndecl (decl);
   fputs (func_decls.str().c_str(), asm_out_file);
 
-  if (worker_bcast_size)
-    write_worker_buffer (asm_out_file, worker_bcast_sym,
-			 worker_bcast_align, worker_bcast_size);
+  if (oacc_bcast_size)
+    write_shared_buffer (asm_out_file, oacc_bcast_sym,
+			 oacc_bcast_align, oacc_bcast_size);
 
   if (worker_red_size)
-    write_worker_buffer (asm_out_file, worker_red_sym,
+    write_shared_buffer (asm_out_file, worker_red_sym,
 			 worker_red_align, worker_red_size);
 
   if (gangprivate_shared_size)
-    write_worker_buffer (asm_out_file, gangprivate_shared_sym,
+    write_shared_buffer (asm_out_file, gangprivate_shared_sym,
 			 gangprivate_shared_align, gangprivate_shared_size);
 
   if (need_softstack_decl)
@@ -4713,7 +4784,7 @@  nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
 /* Worker reduction address expander.  */
 
 static rtx
-nvptx_expand_worker_addr (tree exp, rtx target,
+nvptx_expand_shared_addr (tree exp, rtx target,
 			  machine_mode ARG_UNUSED (mode), int ignore)
 {
   if (ignore)
@@ -4870,7 +4941,7 @@  nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
       return nvptx_expand_shuffle (exp, target, mode, ignore);
 
     case NVPTX_BUILTIN_WORKER_ADDR:
-      return nvptx_expand_worker_addr (exp, target, mode, ignore);
+      return nvptx_expand_shared_addr (exp, target, mode, ignore);
 
     case NVPTX_BUILTIN_CMP_SWAP:
     case NVPTX_BUILTIN_CMP_SWAPLL:
@@ -4882,18 +4953,13 @@  nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
     default: gcc_unreachable ();
     }
 }
-
-/* Define dimension sizes for known hardware.  */
-#define PTX_VECTOR_LENGTH 32
-#define PTX_WORKER_LENGTH 32
-#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
 
 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
 
 static int
 nvptx_simt_vf ()
 {
-  return PTX_VECTOR_LENGTH;
+  return PTX_WARP_SIZE;
 }
 
 /* Validate compute dimensions of an OpenACC offload or routine, fill
@@ -5007,7 +5073,7 @@  nvptx_goacc_fork_join (gcall *call, const int dims[],
    data at that location.  */
 
 static tree
-nvptx_get_worker_red_addr (tree type, tree offset)
+nvptx_get_shared_red_addr (tree type, tree offset)
 {
   machine_mode mode = TYPE_MODE (type);
   tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
@@ -5468,7 +5534,7 @@  nvptx_goacc_reduction_setup (gcall *call)
     {
       /* Store incoming value to worker reduction buffer.  */
       tree offset = gimple_call_arg (call, 5);
-      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
       tree ptr = make_ssa_name (TREE_TYPE (call));
 
       gimplify_assign (ptr, call, &seq);
@@ -5597,7 +5663,7 @@  nvptx_goacc_reduction_fini (gcall *call)
 	{
 	  /* Get reduction buffer address.  */
 	  tree offset = gimple_call_arg (call, 5);
-	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
 	  tree ptr = make_ssa_name (TREE_TYPE (call));
 
 	  gimplify_assign (ptr, call, &seq);
@@ -5645,7 +5711,7 @@  nvptx_goacc_reduction_teardown (gcall *call)
     {
       /* Read the worker reduction buffer.  */
       tree offset = gimple_call_arg (call, 5);
-      tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
+      tree call = nvptx_get_shared_red_addr(TREE_TYPE (var), offset);
       tree ptr = make_ssa_name (TREE_TYPE (call));
 
       gimplify_assign (ptr, call, &seq);