[09/13] Use explicit encodings for simple permutes

Message ID 87d13nlc6v.fsf@linaro.org
State New
Headers show
Series
  • Make VEC_PERM_EXPR work for variable-length vectors
Related show

Commit Message

Richard Sandiford Dec. 9, 2017, 11:21 p.m.
This patch makes users of vec_perm_builders use the compressed encoding
where possible.  This means that they work with variable-length vectors.


2017-12-09  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	* optabs.c (expand_vec_perm_var): Use an explicit encoding for
	the broadcast of the low byte.
	(expand_mult_highpart): Use an explicit encoding for the permutes.
	* optabs-query.c (can_mult_highpart_p): Likewise.
	* tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise.
	* tree-vect-stmts.c (perm_mask_for_reverse): Likewise.
	(vectorizable_bswap): Likewise.
	* tree-vect-data-refs.c (vect_grouped_store_supported): Use an
	explicit encoding for the power-of-2 permutes.
	(vect_permute_store_chain): Likewise.
	(vect_grouped_load_supported): Likewise.
	(vect_permute_load_chain): Likewise.

Comments

Richard Sandiford Dec. 19, 2017, 8:37 p.m. | #1
Ping

Richard Sandiford <richard.sandiford@linaro.org> writes:
> This patch makes users of vec_perm_builders use the compressed encoding

> where possible.  This means that they work with variable-length vectors.

>

>

> 2017-12-09  Richard Sandiford  <richard.sandiford@linaro.org>

>

> gcc/

> 	* optabs.c (expand_vec_perm_var): Use an explicit encoding for

> 	the broadcast of the low byte.

> 	(expand_mult_highpart): Use an explicit encoding for the permutes.

> 	* optabs-query.c (can_mult_highpart_p): Likewise.

> 	* tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise.

> 	* tree-vect-stmts.c (perm_mask_for_reverse): Likewise.

> 	(vectorizable_bswap): Likewise.

> 	* tree-vect-data-refs.c (vect_grouped_store_supported): Use an

> 	explicit encoding for the power-of-2 permutes.

> 	(vect_permute_store_chain): Likewise.

> 	(vect_grouped_load_supported): Likewise.

> 	(vect_permute_load_chain): Likewise.

>

> Index: gcc/optabs.c

> ===================================================================

> --- gcc/optabs.c	2017-12-09 22:48:47.546825312 +0000

> +++ gcc/optabs.c	2017-12-09 22:48:52.266015836 +0000

> @@ -5625,15 +5625,14 @@ expand_vec_perm_var (machine_mode mode,

>  			       NULL, 0, OPTAB_DIRECT);

>    gcc_assert (sel != NULL);

>  

> -  /* Broadcast the low byte each element into each of its bytes.  */

> -  vec_perm_builder const_sel (w, w, 1);

> -  for (i = 0; i < w; ++i)

> -    {

> -      int this_e = i / u * u;

> -      if (BYTES_BIG_ENDIAN)

> -	this_e += u - 1;

> -      const_sel.quick_push (this_e);

> -    }

> +  /* Broadcast the low byte each element into each of its bytes.

> +     The encoding has U interleaved stepped patterns, one for each

> +     byte of an element.  */

> +  vec_perm_builder const_sel (w, u, 3);

> +  unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0;

> +  for (i = 0; i < 3; ++i)

> +    for (unsigned int j = 0; j < u; ++j)

> +      const_sel.quick_push (i * u + low_byte_in_u);

>    sel = gen_lowpart (qimode, sel);

>    sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL);

>    gcc_assert (sel != NULL);

> @@ -5853,16 +5852,20 @@ expand_mult_highpart (machine_mode mode,

>    expand_insn (optab_handler (tab2, mode), 3, eops);

>    m2 = gen_lowpart (mode, eops[0].value);

>  

> -  vec_perm_builder sel (nunits, nunits, 1);

> +  vec_perm_builder sel;

>    if (method == 2)

>      {

> -      for (i = 0; i < nunits; ++i)

> +      /* The encoding has 2 interleaved stepped patterns.  */

> +      sel.new_vector (nunits, 2, 3);

> +      for (i = 0; i < 6; ++i)

>  	sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1)

>  			+ ((i & 1) ? nunits : 0));

>      }

>    else

>      {

> -      for (i = 0; i < nunits; ++i)

> +      /* The encoding has a single interleaved stepped pattern.  */

> +      sel.new_vector (nunits, 1, 3);

> +      for (i = 0; i < 3; ++i)

>  	sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));

>      }

>  

> Index: gcc/optabs-query.c

> ===================================================================

> --- gcc/optabs-query.c	2017-12-09 22:48:47.545825268 +0000

> +++ gcc/optabs-query.c	2017-12-09 22:48:52.265015799 +0000

> @@ -501,8 +501,9 @@ can_mult_highpart_p (machine_mode mode,

>        op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab;

>        if (optab_handler (op, mode) != CODE_FOR_nothing)

>  	{

> -	  vec_perm_builder sel (nunits, nunits, 1);

> -	  for (i = 0; i < nunits; ++i)

> +	  /* The encoding has 2 interleaved stepped patterns.  */

> +	  vec_perm_builder sel (nunits, 2, 3);

> +	  for (i = 0; i < 6; ++i)

>  	    sel.quick_push (!BYTES_BIG_ENDIAN

>  			    + (i & ~1)

>  			    + ((i & 1) ? nunits : 0));

> @@ -518,8 +519,9 @@ can_mult_highpart_p (machine_mode mode,

>        op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;

>        if (optab_handler (op, mode) != CODE_FOR_nothing)

>  	{

> -	  vec_perm_builder sel (nunits, nunits, 1);

> -	  for (i = 0; i < nunits; ++i)

> +	  /* The encoding has a single stepped pattern.  */

> +	  vec_perm_builder sel (nunits, 1, 3);

> +	  for (int i = 0; i < 3; ++i)

>  	    sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));

>  	  vec_perm_indices indices (sel, 2, nunits);

>  	  if (can_vec_perm_const_p (mode, indices))

> Index: gcc/tree-vect-loop.c

> ===================================================================

> --- gcc/tree-vect-loop.c	2017-12-09 22:48:47.547825355 +0000

> +++ gcc/tree-vect-loop.c	2017-12-09 22:48:52.267015873 +0000

> @@ -3716,8 +3716,10 @@ vect_estimate_min_profitable_iters (loop

>  calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,

>  			      vec_perm_builder *sel)

>  {

> -  sel->new_vector (nelt, nelt, 1);

> -  for (unsigned int i = 0; i < nelt; i++)

> +  /* The encoding is a single stepped pattern.  Any wrap-around is handled

> +     by vec_perm_indices.  */

> +  sel->new_vector (nelt, 1, 3);

> +  for (unsigned int i = 0; i < 3; i++)

>      sel->quick_push (i + offset);

>  }

>  

> Index: gcc/tree-vect-stmts.c

> ===================================================================

> --- gcc/tree-vect-stmts.c	2017-12-09 22:48:50.360942531 +0000

> +++ gcc/tree-vect-stmts.c	2017-12-09 22:48:52.268015910 +0000

> @@ -1717,8 +1717,9 @@ perm_mask_for_reverse (tree vectype)

>  

>    nunits = TYPE_VECTOR_SUBPARTS (vectype);

>  

> -  vec_perm_builder sel (nunits, nunits, 1);

> -  for (i = 0; i < nunits; ++i)

> +  /* The encoding has a single stepped pattern.  */

> +  vec_perm_builder sel (nunits, 1, 3);

> +  for (i = 0; i < 3; ++i)

>      sel.quick_push (nunits - 1 - i);

>  

>    vec_perm_indices indices (sel, 1, nunits);

> @@ -2504,8 +2505,9 @@ vectorizable_bswap (gimple *stmt, gimple

>    unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);

>    unsigned word_bytes = num_bytes / nunits;

>  

> -  vec_perm_builder elts (num_bytes, num_bytes, 1);

> -  for (unsigned i = 0; i < nunits; ++i)

> +  /* The encoding uses one stepped pattern for each byte in the word.  */

> +  vec_perm_builder elts (num_bytes, word_bytes, 3);

> +  for (unsigned i = 0; i < 3; ++i)

>      for (unsigned j = 0; j < word_bytes; ++j)

>        elts.quick_push ((i + 1) * word_bytes - j - 1);

>  

> Index: gcc/tree-vect-data-refs.c

> ===================================================================

> --- gcc/tree-vect-data-refs.c	2017-12-09 22:48:47.546825312 +0000

> +++ gcc/tree-vect-data-refs.c	2017-12-09 22:48:52.267015873 +0000

> @@ -4566,14 +4566,13 @@ vect_grouped_store_supported (tree vecty

>    if (VECTOR_MODE_P (mode))

>      {

>        unsigned int i, nelt = GET_MODE_NUNITS (mode);

> -      vec_perm_builder sel (nelt, nelt, 1);

> -      sel.quick_grow (nelt);

> -

>        if (count == 3)

>  	{

>  	  unsigned int j0 = 0, j1 = 0, j2 = 0;

>  	  unsigned int i, j;

>  

> +	  vec_perm_builder sel (nelt, nelt, 1);

> +	  sel.quick_grow (nelt);

>  	  vec_perm_indices indices;

>  	  for (j = 0; j < 3; j++)

>  	    {

> @@ -4623,7 +4622,10 @@ vect_grouped_store_supported (tree vecty

>  	  /* If length is not equal to 3 then only power of 2 is supported.  */

>  	  gcc_assert (pow2p_hwi (count));

>  

> -	  for (i = 0; i < nelt / 2; i++)

> +	  /* The encoding has 2 interleaved stepped patterns.  */

> +	  vec_perm_builder sel (nelt, 2, 3);

> +	  sel.quick_grow (6);

> +	  for (i = 0; i < 3; i++)

>  	    {

>  	      sel[i * 2] = i;

>  	      sel[i * 2 + 1] = i + nelt;

> @@ -4631,7 +4633,7 @@ vect_grouped_store_supported (tree vecty

>  	  vec_perm_indices indices (sel, 2, nelt);

>  	  if (can_vec_perm_const_p (mode, indices))

>  	    {

> -	      for (i = 0; i < nelt; i++)

> +	      for (i = 0; i < 6; i++)

>  		sel[i] += nelt / 2;

>  	      indices.new_vector (sel, 2, nelt);

>  	      if (can_vec_perm_const_p (mode, indices))

> @@ -4736,9 +4738,6 @@ vect_permute_store_chain (vec<tree> dr_c

>    unsigned int i, n, log_length = exact_log2 (length);

>    unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);

>  

> -  vec_perm_builder sel (nelt, nelt, 1);

> -  sel.quick_grow (nelt);

> -

>    result_chain->quick_grow (length);

>    memcpy (result_chain->address (), dr_chain.address (),

>  	  length * sizeof (tree));

> @@ -4747,6 +4746,8 @@ vect_permute_store_chain (vec<tree> dr_c

>      {

>        unsigned int j0 = 0, j1 = 0, j2 = 0;

>  

> +      vec_perm_builder sel (nelt, nelt, 1);

> +      sel.quick_grow (nelt);

>        vec_perm_indices indices;

>        for (j = 0; j < 3; j++)

>          {

> @@ -4808,7 +4809,10 @@ vect_permute_store_chain (vec<tree> dr_c

>        /* If length is not equal to 3 then only power of 2 is supported.  */

>        gcc_assert (pow2p_hwi (length));

>  

> -      for (i = 0, n = nelt / 2; i < n; i++)

> +      /* The encoding has 2 interleaved stepped patterns.  */

> +      vec_perm_builder sel (nelt, 2, 3);

> +      sel.quick_grow (6);

> +      for (i = 0; i < 3; i++)

>  	{

>  	  sel[i * 2] = i;

>  	  sel[i * 2 + 1] = i + nelt;

> @@ -4816,7 +4820,7 @@ vect_permute_store_chain (vec<tree> dr_c

>  	vec_perm_indices indices (sel, 2, nelt);

>  	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

>  

> -	for (i = 0; i < nelt; i++)

> +	for (i = 0; i < 6; i++)

>  	  sel[i] += nelt / 2;

>  	indices.new_vector (sel, 2, nelt);

>  	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

> @@ -5164,11 +5168,11 @@ vect_grouped_load_supported (tree vectyp

>    if (VECTOR_MODE_P (mode))

>      {

>        unsigned int i, j, nelt = GET_MODE_NUNITS (mode);

> -      vec_perm_builder sel (nelt, nelt, 1);

> -      sel.quick_grow (nelt);

>  

>        if (count == 3)

>  	{

> +	  vec_perm_builder sel (nelt, nelt, 1);

> +	  sel.quick_grow (nelt);

>  	  vec_perm_indices indices;

>  	  unsigned int k;

>  	  for (k = 0; k < 3; k++)

> @@ -5209,12 +5213,15 @@ vect_grouped_load_supported (tree vectyp

>  	  /* If length is not equal to 3 then only power of 2 is supported.  */

>  	  gcc_assert (pow2p_hwi (count));

>  

> -	  for (i = 0; i < nelt; i++)

> +	  /* The encoding has a single stepped pattern.  */

> +	  vec_perm_builder sel (nelt, 1, 3);

> +	  sel.quick_grow (3);

> +	  for (i = 0; i < 3; i++)

>  	    sel[i] = i * 2;

>  	  vec_perm_indices indices (sel, 2, nelt);

>  	  if (can_vec_perm_const_p (mode, indices))

>  	    {

> -	      for (i = 0; i < nelt; i++)

> +	      for (i = 0; i < 3; i++)

>  		sel[i] = i * 2 + 1;

>  	      indices.new_vector (sel, 2, nelt);

>  	      if (can_vec_perm_const_p (mode, indices))

> @@ -5332,9 +5339,6 @@ vect_permute_load_chain (vec<tree> dr_ch

>    unsigned int i, j, log_length = exact_log2 (length);

>    unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);

>  

> -  vec_perm_builder sel (nelt, nelt, 1);

> -  sel.quick_grow (nelt);

> -

>    result_chain->quick_grow (length);

>    memcpy (result_chain->address (), dr_chain.address (),

>  	  length * sizeof (tree));

> @@ -5343,6 +5347,8 @@ vect_permute_load_chain (vec<tree> dr_ch

>      {

>        unsigned int k;

>  

> +      vec_perm_builder sel (nelt, nelt, 1);

> +      sel.quick_grow (nelt);

>        vec_perm_indices indices;

>        for (k = 0; k < 3; k++)

>  	{

> @@ -5390,12 +5396,15 @@ vect_permute_load_chain (vec<tree> dr_ch

>        /* If length is not equal to 3 then only power of 2 is supported.  */

>        gcc_assert (pow2p_hwi (length));

>  

> -      for (i = 0; i < nelt; ++i)

> +      /* The encoding has a single stepped pattern.  */

> +      vec_perm_builder sel (nelt, 1, 3);

> +      sel.quick_grow (3);

> +      for (i = 0; i < 3; ++i)

>  	sel[i] = i * 2;

>        vec_perm_indices indices (sel, 2, nelt);

>        perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

>  

> -      for (i = 0; i < nelt; ++i)

> +      for (i = 0; i < 3; ++i)

>  	sel[i] = i * 2 + 1;

>        indices.new_vector (sel, 2, nelt);

>        perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
Richard Biener Jan. 2, 2018, 1:07 p.m. | #2
On Sun, Dec 10, 2017 at 12:21 AM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> This patch makes users of vec_perm_builders use the compressed encoding

> where possible.  This means that they work with variable-length vectors.

>


Ok.

Richard.

> 2017-12-09  Richard Sandiford  <richard.sandiford@linaro.org>

>

> gcc/

>         * optabs.c (expand_vec_perm_var): Use an explicit encoding for

>         the broadcast of the low byte.

>         (expand_mult_highpart): Use an explicit encoding for the permutes.

>         * optabs-query.c (can_mult_highpart_p): Likewise.

>         * tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise.

>         * tree-vect-stmts.c (perm_mask_for_reverse): Likewise.

>         (vectorizable_bswap): Likewise.

>         * tree-vect-data-refs.c (vect_grouped_store_supported): Use an

>         explicit encoding for the power-of-2 permutes.

>         (vect_permute_store_chain): Likewise.

>         (vect_grouped_load_supported): Likewise.

>         (vect_permute_load_chain): Likewise.

>

> Index: gcc/optabs.c

> ===================================================================

> --- gcc/optabs.c        2017-12-09 22:48:47.546825312 +0000

> +++ gcc/optabs.c        2017-12-09 22:48:52.266015836 +0000

> @@ -5625,15 +5625,14 @@ expand_vec_perm_var (machine_mode mode,

>                                NULL, 0, OPTAB_DIRECT);

>    gcc_assert (sel != NULL);

>

> -  /* Broadcast the low byte each element into each of its bytes.  */

> -  vec_perm_builder const_sel (w, w, 1);

> -  for (i = 0; i < w; ++i)

> -    {

> -      int this_e = i / u * u;

> -      if (BYTES_BIG_ENDIAN)

> -       this_e += u - 1;

> -      const_sel.quick_push (this_e);

> -    }

> +  /* Broadcast the low byte each element into each of its bytes.

> +     The encoding has U interleaved stepped patterns, one for each

> +     byte of an element.  */

> +  vec_perm_builder const_sel (w, u, 3);

> +  unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0;

> +  for (i = 0; i < 3; ++i)

> +    for (unsigned int j = 0; j < u; ++j)

> +      const_sel.quick_push (i * u + low_byte_in_u);

>    sel = gen_lowpart (qimode, sel);

>    sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL);

>    gcc_assert (sel != NULL);

> @@ -5853,16 +5852,20 @@ expand_mult_highpart (machine_mode mode,

>    expand_insn (optab_handler (tab2, mode), 3, eops);

>    m2 = gen_lowpart (mode, eops[0].value);

>

> -  vec_perm_builder sel (nunits, nunits, 1);

> +  vec_perm_builder sel;

>    if (method == 2)

>      {

> -      for (i = 0; i < nunits; ++i)

> +      /* The encoding has 2 interleaved stepped patterns.  */

> +      sel.new_vector (nunits, 2, 3);

> +      for (i = 0; i < 6; ++i)

>         sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1)

>                         + ((i & 1) ? nunits : 0));

>      }

>    else

>      {

> -      for (i = 0; i < nunits; ++i)

> +      /* The encoding has a single interleaved stepped pattern.  */

> +      sel.new_vector (nunits, 1, 3);

> +      for (i = 0; i < 3; ++i)

>         sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));

>      }

>

> Index: gcc/optabs-query.c

> ===================================================================

> --- gcc/optabs-query.c  2017-12-09 22:48:47.545825268 +0000

> +++ gcc/optabs-query.c  2017-12-09 22:48:52.265015799 +0000

> @@ -501,8 +501,9 @@ can_mult_highpart_p (machine_mode mode,

>        op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab;

>        if (optab_handler (op, mode) != CODE_FOR_nothing)

>         {

> -         vec_perm_builder sel (nunits, nunits, 1);

> -         for (i = 0; i < nunits; ++i)

> +         /* The encoding has 2 interleaved stepped patterns.  */

> +         vec_perm_builder sel (nunits, 2, 3);

> +         for (i = 0; i < 6; ++i)

>             sel.quick_push (!BYTES_BIG_ENDIAN

>                             + (i & ~1)

>                             + ((i & 1) ? nunits : 0));

> @@ -518,8 +519,9 @@ can_mult_highpart_p (machine_mode mode,

>        op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;

>        if (optab_handler (op, mode) != CODE_FOR_nothing)

>         {

> -         vec_perm_builder sel (nunits, nunits, 1);

> -         for (i = 0; i < nunits; ++i)

> +         /* The encoding has a single stepped pattern.  */

> +         vec_perm_builder sel (nunits, 1, 3);

> +         for (int i = 0; i < 3; ++i)

>             sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));

>           vec_perm_indices indices (sel, 2, nunits);

>           if (can_vec_perm_const_p (mode, indices))

> Index: gcc/tree-vect-loop.c

> ===================================================================

> --- gcc/tree-vect-loop.c        2017-12-09 22:48:47.547825355 +0000

> +++ gcc/tree-vect-loop.c        2017-12-09 22:48:52.267015873 +0000

> @@ -3716,8 +3716,10 @@ vect_estimate_min_profitable_iters (loop

>  calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,

>                               vec_perm_builder *sel)

>  {

> -  sel->new_vector (nelt, nelt, 1);

> -  for (unsigned int i = 0; i < nelt; i++)

> +  /* The encoding is a single stepped pattern.  Any wrap-around is handled

> +     by vec_perm_indices.  */

> +  sel->new_vector (nelt, 1, 3);

> +  for (unsigned int i = 0; i < 3; i++)

>      sel->quick_push (i + offset);

>  }

>

> Index: gcc/tree-vect-stmts.c

> ===================================================================

> --- gcc/tree-vect-stmts.c       2017-12-09 22:48:50.360942531 +0000

> +++ gcc/tree-vect-stmts.c       2017-12-09 22:48:52.268015910 +0000

> @@ -1717,8 +1717,9 @@ perm_mask_for_reverse (tree vectype)

>

>    nunits = TYPE_VECTOR_SUBPARTS (vectype);

>

> -  vec_perm_builder sel (nunits, nunits, 1);

> -  for (i = 0; i < nunits; ++i)

> +  /* The encoding has a single stepped pattern.  */

> +  vec_perm_builder sel (nunits, 1, 3);

> +  for (i = 0; i < 3; ++i)

>      sel.quick_push (nunits - 1 - i);

>

>    vec_perm_indices indices (sel, 1, nunits);

> @@ -2504,8 +2505,9 @@ vectorizable_bswap (gimple *stmt, gimple

>    unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);

>    unsigned word_bytes = num_bytes / nunits;

>

> -  vec_perm_builder elts (num_bytes, num_bytes, 1);

> -  for (unsigned i = 0; i < nunits; ++i)

> +  /* The encoding uses one stepped pattern for each byte in the word.  */

> +  vec_perm_builder elts (num_bytes, word_bytes, 3);

> +  for (unsigned i = 0; i < 3; ++i)

>      for (unsigned j = 0; j < word_bytes; ++j)

>        elts.quick_push ((i + 1) * word_bytes - j - 1);

>

> Index: gcc/tree-vect-data-refs.c

> ===================================================================

> --- gcc/tree-vect-data-refs.c   2017-12-09 22:48:47.546825312 +0000

> +++ gcc/tree-vect-data-refs.c   2017-12-09 22:48:52.267015873 +0000

> @@ -4566,14 +4566,13 @@ vect_grouped_store_supported (tree vecty

>    if (VECTOR_MODE_P (mode))

>      {

>        unsigned int i, nelt = GET_MODE_NUNITS (mode);

> -      vec_perm_builder sel (nelt, nelt, 1);

> -      sel.quick_grow (nelt);

> -

>        if (count == 3)

>         {

>           unsigned int j0 = 0, j1 = 0, j2 = 0;

>           unsigned int i, j;

>

> +         vec_perm_builder sel (nelt, nelt, 1);

> +         sel.quick_grow (nelt);

>           vec_perm_indices indices;

>           for (j = 0; j < 3; j++)

>             {

> @@ -4623,7 +4622,10 @@ vect_grouped_store_supported (tree vecty

>           /* If length is not equal to 3 then only power of 2 is supported.  */

>           gcc_assert (pow2p_hwi (count));

>

> -         for (i = 0; i < nelt / 2; i++)

> +         /* The encoding has 2 interleaved stepped patterns.  */

> +         vec_perm_builder sel (nelt, 2, 3);

> +         sel.quick_grow (6);

> +         for (i = 0; i < 3; i++)

>             {

>               sel[i * 2] = i;

>               sel[i * 2 + 1] = i + nelt;

> @@ -4631,7 +4633,7 @@ vect_grouped_store_supported (tree vecty

>           vec_perm_indices indices (sel, 2, nelt);

>           if (can_vec_perm_const_p (mode, indices))

>             {

> -             for (i = 0; i < nelt; i++)

> +             for (i = 0; i < 6; i++)

>                 sel[i] += nelt / 2;

>               indices.new_vector (sel, 2, nelt);

>               if (can_vec_perm_const_p (mode, indices))

> @@ -4736,9 +4738,6 @@ vect_permute_store_chain (vec<tree> dr_c

>    unsigned int i, n, log_length = exact_log2 (length);

>    unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);

>

> -  vec_perm_builder sel (nelt, nelt, 1);

> -  sel.quick_grow (nelt);

> -

>    result_chain->quick_grow (length);

>    memcpy (result_chain->address (), dr_chain.address (),

>           length * sizeof (tree));

> @@ -4747,6 +4746,8 @@ vect_permute_store_chain (vec<tree> dr_c

>      {

>        unsigned int j0 = 0, j1 = 0, j2 = 0;

>

> +      vec_perm_builder sel (nelt, nelt, 1);

> +      sel.quick_grow (nelt);

>        vec_perm_indices indices;

>        for (j = 0; j < 3; j++)

>          {

> @@ -4808,7 +4809,10 @@ vect_permute_store_chain (vec<tree> dr_c

>        /* If length is not equal to 3 then only power of 2 is supported.  */

>        gcc_assert (pow2p_hwi (length));

>

> -      for (i = 0, n = nelt / 2; i < n; i++)

> +      /* The encoding has 2 interleaved stepped patterns.  */

> +      vec_perm_builder sel (nelt, 2, 3);

> +      sel.quick_grow (6);

> +      for (i = 0; i < 3; i++)

>         {

>           sel[i * 2] = i;

>           sel[i * 2 + 1] = i + nelt;

> @@ -4816,7 +4820,7 @@ vect_permute_store_chain (vec<tree> dr_c

>         vec_perm_indices indices (sel, 2, nelt);

>         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

>

> -       for (i = 0; i < nelt; i++)

> +       for (i = 0; i < 6; i++)

>           sel[i] += nelt / 2;

>         indices.new_vector (sel, 2, nelt);

>         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

> @@ -5164,11 +5168,11 @@ vect_grouped_load_supported (tree vectyp

>    if (VECTOR_MODE_P (mode))

>      {

>        unsigned int i, j, nelt = GET_MODE_NUNITS (mode);

> -      vec_perm_builder sel (nelt, nelt, 1);

> -      sel.quick_grow (nelt);

>

>        if (count == 3)

>         {

> +         vec_perm_builder sel (nelt, nelt, 1);

> +         sel.quick_grow (nelt);

>           vec_perm_indices indices;

>           unsigned int k;

>           for (k = 0; k < 3; k++)

> @@ -5209,12 +5213,15 @@ vect_grouped_load_supported (tree vectyp

>           /* If length is not equal to 3 then only power of 2 is supported.  */

>           gcc_assert (pow2p_hwi (count));

>

> -         for (i = 0; i < nelt; i++)

> +         /* The encoding has a single stepped pattern.  */

> +         vec_perm_builder sel (nelt, 1, 3);

> +         sel.quick_grow (3);

> +         for (i = 0; i < 3; i++)

>             sel[i] = i * 2;

>           vec_perm_indices indices (sel, 2, nelt);

>           if (can_vec_perm_const_p (mode, indices))

>             {

> -             for (i = 0; i < nelt; i++)

> +             for (i = 0; i < 3; i++)

>                 sel[i] = i * 2 + 1;

>               indices.new_vector (sel, 2, nelt);

>               if (can_vec_perm_const_p (mode, indices))

> @@ -5332,9 +5339,6 @@ vect_permute_load_chain (vec<tree> dr_ch

>    unsigned int i, j, log_length = exact_log2 (length);

>    unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);

>

> -  vec_perm_builder sel (nelt, nelt, 1);

> -  sel.quick_grow (nelt);

> -

>    result_chain->quick_grow (length);

>    memcpy (result_chain->address (), dr_chain.address (),

>           length * sizeof (tree));

> @@ -5343,6 +5347,8 @@ vect_permute_load_chain (vec<tree> dr_ch

>      {

>        unsigned int k;

>

> +      vec_perm_builder sel (nelt, nelt, 1);

> +      sel.quick_grow (nelt);

>        vec_perm_indices indices;

>        for (k = 0; k < 3; k++)

>         {

> @@ -5390,12 +5396,15 @@ vect_permute_load_chain (vec<tree> dr_ch

>        /* If length is not equal to 3 then only power of 2 is supported.  */

>        gcc_assert (pow2p_hwi (length));

>

> -      for (i = 0; i < nelt; ++i)

> +      /* The encoding has a single stepped pattern.  */

> +      vec_perm_builder sel (nelt, 1, 3);

> +      sel.quick_grow (3);

> +      for (i = 0; i < 3; ++i)

>         sel[i] = i * 2;

>        vec_perm_indices indices (sel, 2, nelt);

>        perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

>

> -      for (i = 0; i < nelt; ++i)

> +      for (i = 0; i < 3; ++i)

>         sel[i] = i * 2 + 1;

>        indices.new_vector (sel, 2, nelt);

>        perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

Patch

Index: gcc/optabs.c
===================================================================
--- gcc/optabs.c	2017-12-09 22:48:47.546825312 +0000
+++ gcc/optabs.c	2017-12-09 22:48:52.266015836 +0000
@@ -5625,15 +5625,14 @@  expand_vec_perm_var (machine_mode mode,
 			       NULL, 0, OPTAB_DIRECT);
   gcc_assert (sel != NULL);
 
-  /* Broadcast the low byte each element into each of its bytes.  */
-  vec_perm_builder const_sel (w, w, 1);
-  for (i = 0; i < w; ++i)
-    {
-      int this_e = i / u * u;
-      if (BYTES_BIG_ENDIAN)
-	this_e += u - 1;
-      const_sel.quick_push (this_e);
-    }
+  /* Broadcast the low byte each element into each of its bytes.
+     The encoding has U interleaved stepped patterns, one for each
+     byte of an element.  */
+  vec_perm_builder const_sel (w, u, 3);
+  unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0;
+  for (i = 0; i < 3; ++i)
+    for (unsigned int j = 0; j < u; ++j)
+      const_sel.quick_push (i * u + low_byte_in_u);
   sel = gen_lowpart (qimode, sel);
   sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL);
   gcc_assert (sel != NULL);
@@ -5853,16 +5852,20 @@  expand_mult_highpart (machine_mode mode,
   expand_insn (optab_handler (tab2, mode), 3, eops);
   m2 = gen_lowpart (mode, eops[0].value);
 
-  vec_perm_builder sel (nunits, nunits, 1);
+  vec_perm_builder sel;
   if (method == 2)
     {
-      for (i = 0; i < nunits; ++i)
+      /* The encoding has 2 interleaved stepped patterns.  */
+      sel.new_vector (nunits, 2, 3);
+      for (i = 0; i < 6; ++i)
 	sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1)
 			+ ((i & 1) ? nunits : 0));
     }
   else
     {
-      for (i = 0; i < nunits; ++i)
+      /* The encoding has a single interleaved stepped pattern.  */
+      sel.new_vector (nunits, 1, 3);
+      for (i = 0; i < 3; ++i)
 	sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));
     }
 
Index: gcc/optabs-query.c
===================================================================
--- gcc/optabs-query.c	2017-12-09 22:48:47.545825268 +0000
+++ gcc/optabs-query.c	2017-12-09 22:48:52.265015799 +0000
@@ -501,8 +501,9 @@  can_mult_highpart_p (machine_mode mode,
       op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab;
       if (optab_handler (op, mode) != CODE_FOR_nothing)
 	{
-	  vec_perm_builder sel (nunits, nunits, 1);
-	  for (i = 0; i < nunits; ++i)
+	  /* The encoding has 2 interleaved stepped patterns.  */
+	  vec_perm_builder sel (nunits, 2, 3);
+	  for (i = 0; i < 6; ++i)
 	    sel.quick_push (!BYTES_BIG_ENDIAN
 			    + (i & ~1)
 			    + ((i & 1) ? nunits : 0));
@@ -518,8 +519,9 @@  can_mult_highpart_p (machine_mode mode,
       op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;
       if (optab_handler (op, mode) != CODE_FOR_nothing)
 	{
-	  vec_perm_builder sel (nunits, nunits, 1);
-	  for (i = 0; i < nunits; ++i)
+	  /* The encoding has a single stepped pattern.  */
+	  vec_perm_builder sel (nunits, 1, 3);
+	  for (int i = 0; i < 3; ++i)
 	    sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));
 	  vec_perm_indices indices (sel, 2, nunits);
 	  if (can_vec_perm_const_p (mode, indices))
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	2017-12-09 22:48:47.547825355 +0000
+++ gcc/tree-vect-loop.c	2017-12-09 22:48:52.267015873 +0000
@@ -3716,8 +3716,10 @@  vect_estimate_min_profitable_iters (loop
 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
 			      vec_perm_builder *sel)
 {
-  sel->new_vector (nelt, nelt, 1);
-  for (unsigned int i = 0; i < nelt; i++)
+  /* The encoding is a single stepped pattern.  Any wrap-around is handled
+     by vec_perm_indices.  */
+  sel->new_vector (nelt, 1, 3);
+  for (unsigned int i = 0; i < 3; i++)
     sel->quick_push (i + offset);
 }
 
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	2017-12-09 22:48:50.360942531 +0000
+++ gcc/tree-vect-stmts.c	2017-12-09 22:48:52.268015910 +0000
@@ -1717,8 +1717,9 @@  perm_mask_for_reverse (tree vectype)
 
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
-  vec_perm_builder sel (nunits, nunits, 1);
-  for (i = 0; i < nunits; ++i)
+  /* The encoding has a single stepped pattern.  */
+  vec_perm_builder sel (nunits, 1, 3);
+  for (i = 0; i < 3; ++i)
     sel.quick_push (nunits - 1 - i);
 
   vec_perm_indices indices (sel, 1, nunits);
@@ -2504,8 +2505,9 @@  vectorizable_bswap (gimple *stmt, gimple
   unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);
   unsigned word_bytes = num_bytes / nunits;
 
-  vec_perm_builder elts (num_bytes, num_bytes, 1);
-  for (unsigned i = 0; i < nunits; ++i)
+  /* The encoding uses one stepped pattern for each byte in the word.  */
+  vec_perm_builder elts (num_bytes, word_bytes, 3);
+  for (unsigned i = 0; i < 3; ++i)
     for (unsigned j = 0; j < word_bytes; ++j)
       elts.quick_push ((i + 1) * word_bytes - j - 1);
 
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	2017-12-09 22:48:47.546825312 +0000
+++ gcc/tree-vect-data-refs.c	2017-12-09 22:48:52.267015873 +0000
@@ -4566,14 +4566,13 @@  vect_grouped_store_supported (tree vecty
   if (VECTOR_MODE_P (mode))
     {
       unsigned int i, nelt = GET_MODE_NUNITS (mode);
-      vec_perm_builder sel (nelt, nelt, 1);
-      sel.quick_grow (nelt);
-
       if (count == 3)
 	{
 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
 	  unsigned int i, j;
 
+	  vec_perm_builder sel (nelt, nelt, 1);
+	  sel.quick_grow (nelt);
 	  vec_perm_indices indices;
 	  for (j = 0; j < 3; j++)
 	    {
@@ -4623,7 +4622,10 @@  vect_grouped_store_supported (tree vecty
 	  /* If length is not equal to 3 then only power of 2 is supported.  */
 	  gcc_assert (pow2p_hwi (count));
 
-	  for (i = 0; i < nelt / 2; i++)
+	  /* The encoding has 2 interleaved stepped patterns.  */
+	  vec_perm_builder sel (nelt, 2, 3);
+	  sel.quick_grow (6);
+	  for (i = 0; i < 3; i++)
 	    {
 	      sel[i * 2] = i;
 	      sel[i * 2 + 1] = i + nelt;
@@ -4631,7 +4633,7 @@  vect_grouped_store_supported (tree vecty
 	  vec_perm_indices indices (sel, 2, nelt);
 	  if (can_vec_perm_const_p (mode, indices))
 	    {
-	      for (i = 0; i < nelt; i++)
+	      for (i = 0; i < 6; i++)
 		sel[i] += nelt / 2;
 	      indices.new_vector (sel, 2, nelt);
 	      if (can_vec_perm_const_p (mode, indices))
@@ -4736,9 +4738,6 @@  vect_permute_store_chain (vec<tree> dr_c
   unsigned int i, n, log_length = exact_log2 (length);
   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
 
-  vec_perm_builder sel (nelt, nelt, 1);
-  sel.quick_grow (nelt);
-
   result_chain->quick_grow (length);
   memcpy (result_chain->address (), dr_chain.address (),
 	  length * sizeof (tree));
@@ -4747,6 +4746,8 @@  vect_permute_store_chain (vec<tree> dr_c
     {
       unsigned int j0 = 0, j1 = 0, j2 = 0;
 
+      vec_perm_builder sel (nelt, nelt, 1);
+      sel.quick_grow (nelt);
       vec_perm_indices indices;
       for (j = 0; j < 3; j++)
         {
@@ -4808,7 +4809,10 @@  vect_permute_store_chain (vec<tree> dr_c
       /* If length is not equal to 3 then only power of 2 is supported.  */
       gcc_assert (pow2p_hwi (length));
 
-      for (i = 0, n = nelt / 2; i < n; i++)
+      /* The encoding has 2 interleaved stepped patterns.  */
+      vec_perm_builder sel (nelt, 2, 3);
+      sel.quick_grow (6);
+      for (i = 0; i < 3; i++)
 	{
 	  sel[i * 2] = i;
 	  sel[i * 2 + 1] = i + nelt;
@@ -4816,7 +4820,7 @@  vect_permute_store_chain (vec<tree> dr_c
 	vec_perm_indices indices (sel, 2, nelt);
 	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
 
-	for (i = 0; i < nelt; i++)
+	for (i = 0; i < 6; i++)
 	  sel[i] += nelt / 2;
 	indices.new_vector (sel, 2, nelt);
 	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
@@ -5164,11 +5168,11 @@  vect_grouped_load_supported (tree vectyp
   if (VECTOR_MODE_P (mode))
     {
       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
-      vec_perm_builder sel (nelt, nelt, 1);
-      sel.quick_grow (nelt);
 
       if (count == 3)
 	{
+	  vec_perm_builder sel (nelt, nelt, 1);
+	  sel.quick_grow (nelt);
 	  vec_perm_indices indices;
 	  unsigned int k;
 	  for (k = 0; k < 3; k++)
@@ -5209,12 +5213,15 @@  vect_grouped_load_supported (tree vectyp
 	  /* If length is not equal to 3 then only power of 2 is supported.  */
 	  gcc_assert (pow2p_hwi (count));
 
-	  for (i = 0; i < nelt; i++)
+	  /* The encoding has a single stepped pattern.  */
+	  vec_perm_builder sel (nelt, 1, 3);
+	  sel.quick_grow (3);
+	  for (i = 0; i < 3; i++)
 	    sel[i] = i * 2;
 	  vec_perm_indices indices (sel, 2, nelt);
 	  if (can_vec_perm_const_p (mode, indices))
 	    {
-	      for (i = 0; i < nelt; i++)
+	      for (i = 0; i < 3; i++)
 		sel[i] = i * 2 + 1;
 	      indices.new_vector (sel, 2, nelt);
 	      if (can_vec_perm_const_p (mode, indices))
@@ -5332,9 +5339,6 @@  vect_permute_load_chain (vec<tree> dr_ch
   unsigned int i, j, log_length = exact_log2 (length);
   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
 
-  vec_perm_builder sel (nelt, nelt, 1);
-  sel.quick_grow (nelt);
-
   result_chain->quick_grow (length);
   memcpy (result_chain->address (), dr_chain.address (),
 	  length * sizeof (tree));
@@ -5343,6 +5347,8 @@  vect_permute_load_chain (vec<tree> dr_ch
     {
       unsigned int k;
 
+      vec_perm_builder sel (nelt, nelt, 1);
+      sel.quick_grow (nelt);
       vec_perm_indices indices;
       for (k = 0; k < 3; k++)
 	{
@@ -5390,12 +5396,15 @@  vect_permute_load_chain (vec<tree> dr_ch
       /* If length is not equal to 3 then only power of 2 is supported.  */
       gcc_assert (pow2p_hwi (length));
 
-      for (i = 0; i < nelt; ++i)
+      /* The encoding has a single stepped pattern.  */
+      vec_perm_builder sel (nelt, 1, 3);
+      sel.quick_grow (3);
+      for (i = 0; i < 3; ++i)
 	sel[i] = i * 2;
       vec_perm_indices indices (sel, 2, nelt);
       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
 
-      for (i = 0; i < nelt; ++i)
+      for (i = 0; i < 3; ++i)
 	sel[i] = i * 2 + 1;
       indices.new_vector (sel, 2, nelt);
       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);