[nvptx,committed] Force vl32 if calling vector-partitionable routines

Message ID f613e6c0-df68-06ec-9a06-3def009a0dcd@suse.de
State New
Headers show
Series
  • [nvptx,committed] Force vl32 if calling vector-partitionable routines
Related show

Commit Message

Tom de Vries Jan. 7, 2019, 7:11 p.m.
[ was: Re: [nvptx] vector length patch series ]

On 14-12-18 20:58, Tom de Vries wrote:
> 0023-nvptx-Force-vl32-if-calling-vector-partitionable-rou.patch


> @@ -73,6 +73,7 @@

>  #include "cfgloop.h"

>  #include "fold-const.h"

>  #include "intl.h"

> +#include "tree-hash-traits.h"

>  #include "omp-offload.h"

>  

>  /* This file should be included last.  */


I dropped that include; it's not necessary.

> @@ -5557,19 +5637,6 @@ nvptx_adjust_parallelism (unsigned inner_mask, unsigned outer_mask)

>    if (wv)

>      return inner_mask & ~GOMP_DIM_MASK (GOMP_DIM_WORKER);

>  

> -  /* It's difficult to guarantee that warps in large vector_lengths

> -     will remain convergent when a vector loop is nested inside a

> -     worker loop.  Therefore, fallback to setting vector_length to

> -     PTX_WARP_SIZE.  Hopefully this condition may be relaxed for

> -     sm_70+ targets.  */

> -  if ((inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))

> -      && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))

> -    {

> -      tree attr = tree_cons (get_identifier (NVPTX_GOACC_VL_WARP), NULL_TREE,

> -                             DECL_ATTRIBUTES (current_function_decl));

> -      DECL_ATTRIBUTES (current_function_decl) = attr;

> -    }

> -

>    return inner_mask;

>  }

>  


This patch removes some code related to a workaround that was
added earlier in the patch series
(0017-nvptx-Enable-large-vectors.patch), which means that the submitted
patch should not have contained that code in the first place.

Committed (without test-cases) as attached.

Thanks,
- Tom

Patch

[nvptx] Force vl32 if calling vector-partitionable routines

With PTX_MAX_VECTOR_LENGTH set to larger than PTX_WARP_SIZE, routines can be
called from offloading regions with vector-size set to larger than warp size.
OTOH, vector-partitionable routines assume warp-sized vector length.

Detect if we're calling a vector-partitionable routine from an offloading
region, and if so, fall back to warp-sized vector length in that region.

2018-12-17  Tom de Vries  <tdevries@suse.de>

	PR target/85486
	* config/nvptx/nvptx.c (has_vector_partitionable_routine_calls_p): New
	function.
	(nvptx_goacc_validate_dims): Force vl32 if calling vector-partitionable
	routines.

---
 gcc/config/nvptx/nvptx.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5a4b38de522..7fdc285b6f8 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -59,6 +59,7 @@ 
 #include "builtins.h"
 #include "omp-general.h"
 #include "omp-low.h"
+#include "omp-offload.h"
 #include "gomp-constants.h"
 #include "dumpfile.h"
 #include "internal-fn.h"
@@ -5496,6 +5497,40 @@  nvptx_apply_dim_limits (int dims[])
     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
 }
 
+/* Return true if FNDECL contains calls to vector-partitionable routines.  */
+
+static bool
+has_vector_partitionable_routine_calls_p (tree fndecl)
+{
+  if (!fndecl)
+    return false;
+
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
+    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
+	 gsi_next_nondebug (&i))
+      {
+	gimple *stmt = gsi_stmt (i);
+	if (gimple_code (stmt) != GIMPLE_CALL)
+	  continue;
+	/* Without a callee decl there are no attributes to inspect.  */
+	tree callee = gimple_call_fndecl (stmt);
+	if (!callee)
+	  continue;
+	/* No OpenACC function attribute: skip it and keep scanning.  */
+	tree attrs = oacc_get_fn_attrib (callee);
+	if (attrs == NULL_TREE)
+	  continue;
+	/* A level of GOMP_DIM_MAX denotes a seq routine.  */
+	int partition_level = oacc_fn_attrib_level (attrs);
+	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
+	if (!seq_routine_p)
+	  return true;
+      }
+
+  return false;
+}
+
 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
    DIMS has changed.  */
 
@@ -5611,6 +5646,16 @@  nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level)
     old_dims[i] = dims[i];
 
   const char *vector_reason = NULL;
+  if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
+    {
+      if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+	{
+	  vector_reason = G_("using vector_length (%d) due to call to"
+			     " vector-partitionable routine, ignoring %d");
+	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+	}
+    }
+
   if (dims[GOMP_DIM_VECTOR] == 0)
     {
       vector_reason = G_("using vector_length (%d), ignoring runtime setting");