[X86] Fold more shuffle builtins to VEC_PERM_EXPR.

Message ID CAMZc-bwSDvTwsiwf3QO+pJ+3pr3iciWsrGQddqw35TigVicPsg@mail.gmail.com
State New

Commit Message

H.J. Lu via Gcc-patches Dec. 15, 2020, 10:10 a.m.
Hi:
  As indicated in PR98167, this patch is a follow-up to [1].
  Bootstrapped and regtested on x86_64-linux-gnu.
  Ok for trunk?

gcc/
        PR target/98167
        * config/i386/i386.c (ix86_gimple_fold_builtin): Handle
        IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
        IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
        IX86_BUILTIN_SHUFPS256.

gcc/testsuite/
        * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
        * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.

[1]. https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html

-- 
BR,
Hongtao
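
For reference, the 128-bit fold this patch generalizes already turns
_mm_shuffle_pd into a generic two-source permutation.  Here is a minimal
standalone sketch of the same selector in plain C (shufpd_as_perm is an
illustrative name only, not something in the tree):

#include <immintrin.h>

typedef long long v2di __attribute__ ((vector_size (16)));

/* Indices 0-1 select from a, indices 2-3 select from b; imm8 bits 0
   and 1 pick the element within each source, matching the selector the
   existing V2DF fold builds for VEC_PERM_EXPR.  */
__m128d
shufpd_as_perm (__m128d a, __m128d b, int imm8)
{
  v2di sel = { imm8 & 1, 2 + ((imm8 >> 1) & 1) };
  return __builtin_shuffle (a, b, sel);
}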

Comments

H.J. Lu via Gcc-patches Dec. 15, 2020, 11:11 a.m. | #1
On Tue, Dec 15, 2020 at 06:10:57PM +0800, Hongtao Liu via Gcc-patches wrote:
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18187,21 +18187,67 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>  	}
>        break;
>  
> +    case IX86_BUILTIN_SHUFPD512:
> +    case IX86_BUILTIN_SHUFPS512:
> +      if (n_args > 2)
> +	{
> +	  /* This is a masked shuffle.  Only optimize if the mask is all ones.  */
> +	  tree argl = gimple_call_arg (stmt, n_args - 1);
> +	  arg0 = gimple_call_arg (stmt, 0);
> +	  if (!tree_fits_uhwi_p (argl))
> +	    break;
> +	  unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
> +	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));

I think it would be better not to mix the argl and arg0 stuff.
So e.g. do
	  arg0 = gimple_call_arg (stmt, 0);
	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
first and then the argl stuff, or vice versa.
Furthermore, you don't really care about the upper bits of argl,
so why not just punt if (TREE_CODE (argl) != INTEGER_CST)
and use mask = TREE_INT_CST_LOW (argl)?

> +	  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
> +	    break;
> +	}
> +      /* Fall thru.  */
>      case IX86_BUILTIN_SHUFPD:
> +    case IX86_BUILTIN_SHUFPD256:
> +    case IX86_BUILTIN_SHUFPS:
> +    case IX86_BUILTIN_SHUFPS256:
>        arg2 = gimple_call_arg (stmt, 2);
>        if (TREE_CODE (arg2) == INTEGER_CST)
>  	{
> -	  location_t loc = gimple_location (stmt);
> -	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
>  	  arg0 = gimple_call_arg (stmt, 0);
> +	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +	  machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
> +	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
> +
> +	  /* Check valid imm, refer to gcc.target/i386/testimm-10.c.  */
> +	  if (imask > 255
> +	      || (imask >= HOST_WIDE_INT_1U << elems
> +		  && imode == E_DFmode))
> +	    return false;

Why is this extra checking done only for DFmode and not for SFmode?

> +	  tree itype = imode == E_DFmode
> +	    ? long_long_integer_type_node : integer_type_node;

Formatting.  Should be e.g.
	  tree itype
	    = (imode == E_DFmode
	       ? long_long_integer_type_node : integer_type_node);
or
	  tree itype = (imode == E_DFmode ? long_long_integer_type_node
					  : integer_type_node);
but the ? which is part of the imode == E_DFmode ... subexpression
can't just be below something unrelated.

> +	      if (imode == E_DFmode)
> +		sel_idx = (i & 1) * elems
> +		  + (i >> 1 << 1) + ((imask & 1 << i) >> i);

Again, formatting.  Plus, i >> 1 << 1 looks too ugly/unreadable;
if you mean i & ~1, write it like that; it is up to the compiler to emit
it as i >> 1 << 1 if that is the best implementation.

> +	      else
> +		{
> +		  /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
> +		     controls for each element of the destination.  */
> +		  unsigned j = i % 4;
> +		  sel_idx = ((i & 2) >> 1) * elems
> +		    + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j);

Ditto.

	Jakub
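
For context, the all-ones test discussed above works by OR-ing the
incoming write mask with every bit above the low elems bits, so the
comparison against all ones succeeds exactly when no lane is masked
off.  A standalone sketch of that identity (mask_all_ones_p is an
illustrative name; it assumes elems < 64, which holds for every vector
length involved here):

#include <assert.h>

/* Mirror of the fold's write-mask test: the low elems bits of mask
   must all be set for the shuffle to be foldable.  */
static int
mask_all_ones_p (unsigned long long mask, unsigned elems)
{
  return (mask | (~0ULL << elems)) == ~0ULL;
}

int
main (void)
{
  assert (mask_all_ones_p (0xff, 8));     /* k = -1 for V8DF */
  assert (!mask_all_ones_p (0xfe, 8));    /* lane 0 masked off */
  assert (mask_all_ones_p (0xffff, 16));  /* k = -1 for V16SF */
  return 0;
}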
H.J. Lu via Gcc-patches Dec. 16, 2020, 10:41 a.m. | #2
On Tue, Dec 15, 2020 at 7:11 PM Jakub Jelinek <jakub@redhat.com> wrote:
> I think it would be better not to mix the argl and arg0 stuff.
> So e.g. do
>           arg0 = gimple_call_arg (stmt, 0);
>           unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> first and then the argl stuff, or vice versa.
> Furthermore, you don't really care about the upper bits of argl,
> so why not just punt if (TREE_CODE (argl) != INTEGER_CST)
> and use mask = TREE_INT_CST_LOW (argl)?

Yes, and for maintenance convenience I put this code into a new
function, which can also be called for the masked-shift case:

@@ -18128,17 +18142,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
       gcc_assert (n_args >= 2);
       arg0 = gimple_call_arg (stmt, 0);
       arg1 = gimple_call_arg (stmt, 1);
-      if (n_args > 2)
-	{
-	  /* This is masked shift.  Only optimize if the mask is all ones.  */
-	  tree argl = gimple_call_arg (stmt, n_args - 1);
-	  if (!tree_fits_uhwi_p (argl))
-	    break;
-	  unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
-	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
-	  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
-	    break;
-	}
+      /* For masked shift, only optimize if the mask is all ones.  */
+      if (n_args > 2
+	  && !ix86_masked_all_ones (arg0, gimple_call_arg (stmt, n_args - 1)))
+	break;

> Why is this extra checking done only for DFmode and not for SFmode?

Oh, yes; I deleted the extra check.  The instruction ignores the high
bits of the imm for the 128/256-bit DFmode versions.

> Formatting.  Should be e.g.
>           tree itype
>             = (imode == E_DFmode
>                ? long_long_integer_type_node : integer_type_node);
> or
>           tree itype = (imode == E_DFmode ? long_long_integer_type_node
>                                           : integer_type_node);
> but the ? which is part of the imode == E_DFmode ... subexpression
> can't just be below something unrelated.

Changed.

> Again, formatting.  Plus, i >> 1 << 1 looks too ugly/unreadable;
> if you mean i & ~1, write it like that; it is up to the compiler to emit
> it as i >> 1 << 1 if that is the best implementation.

Changed.

> Ditto.

Changed.

Updated patch below.

--
BR,
Hongtao
From 1cfec402ffa25375c88fa38e783d203401f38c5e Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 11 Dec 2020 19:02:43 +0800
Subject: [PATCH] [X86] Fold more shuffle builtins to VEC_PERM_EXPR.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html

gcc/
	PR target/98167
	* config/i386/i386.c (ix86_gimple_fold_builtin): Handle
	IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
	IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
	IX86_BUILTIN_SHUFPS256.

gcc/testsuite/
	* gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
	* gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
---
 gcc/config/i386/i386.c                        | 86 ++++++++++++++-----
 .../gcc.target/i386/avx512f-vshufpd-1.c       |  3 +-
 .../gcc.target/i386/avx512f-vshufps-1.c       |  3 +-
 3 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 54b7e103ba2..ecae06bbef8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -17403,6 +17403,22 @@ ix86_vector_shift_count (tree arg1)
   return NULL_TREE;
 }
 
+/* Return true if ARG_MASK is all ones for ARG_VEC's number of elements.  */
+static bool
+ix86_masked_all_ones (tree arg_vec, tree arg_mask)
+{
+  if (TREE_CODE (arg_mask) != INTEGER_CST)
+    return false;
+
+  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg_vec));
+  unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
+  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
+    return false;
+
+  return true;
+
+}
+
 static tree
 ix86_fold_builtin (tree fndecl, int n_args,
 		   tree *args, bool ignore ATTRIBUTE_UNUSED)
@@ -18128,17 +18144,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
       gcc_assert (n_args >= 2);
       arg0 = gimple_call_arg (stmt, 0);
       arg1 = gimple_call_arg (stmt, 1);
-      if (n_args > 2)
-	{
-	  /* This is masked shift.  Only optimize if the mask is all ones.  */
-	  tree argl = gimple_call_arg (stmt, n_args - 1);
-	  if (!tree_fits_uhwi_p (argl))
-	    break;
-	  unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
-	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
-	  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
-	    break;
-	}
+      /* For masked shift, only optimize if the mask is all ones.  */
+      if (n_args > 2
+	  && !ix86_masked_all_ones (arg0, gimple_call_arg (stmt, n_args - 1)))
+	break;
       if (is_vshift)
 	{
 	  if (TREE_CODE (arg1) != VECTOR_CST)
@@ -18187,21 +18196,58 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	}
       break;
 
+    case IX86_BUILTIN_SHUFPD512:
+    case IX86_BUILTIN_SHUFPS512:
+      /* This is a masked shuffle.  Only optimize if the mask is all ones.  */
+      if (n_args > 2
+	  && !ix86_masked_all_ones (gimple_call_arg (stmt, 0),
+				    gimple_call_arg (stmt, n_args - 1)))
+	break;
+      /* Fall thru.  */
     case IX86_BUILTIN_SHUFPD:
+    case IX86_BUILTIN_SHUFPD256:
+    case IX86_BUILTIN_SHUFPS:
+    case IX86_BUILTIN_SHUFPS256:
       arg2 = gimple_call_arg (stmt, 2);
       if (TREE_CODE (arg2) == INTEGER_CST)
 	{
-	  location_t loc = gimple_location (stmt);
-	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
 	  arg0 = gimple_call_arg (stmt, 0);
+	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+	  machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
+	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
+
+	  /* Check valid imm, refer to gcc.target/i386/testimm-10.c.  */
+	  if (imask > 255)
+	    return false;
+
 	  arg1 = gimple_call_arg (stmt, 1);
-	  tree itype = long_long_integer_type_node;
-	  tree vtype = build_vector_type (itype, 2); /* V2DI */
-	  tree_vector_builder elts (vtype, 2, 1);
-	  /* Ignore bits other than the lowest 2.  */
-	  elts.quick_push (build_int_cst (itype, imask & 1));
-	  imask >>= 1;
-	  elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
+	  location_t loc = gimple_location (stmt);
+	  tree itype = (imode == E_DFmode
+			? long_long_integer_type_node : integer_type_node);
+	  /* V2DI/V4DI/V8DI/V4SI/V8SI/V16SI  */
+	  tree vtype = build_vector_type (itype, elems);
+	  tree_vector_builder elts (vtype, elems, 1);
+
+	  for (unsigned i = 0; i != elems; i++)
+	    {
+	      unsigned sel_idx;
+	      /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6])
+		 provide 2 select controls for each element of the
+		 destination.  */
+	      if (imode == E_DFmode)
+		sel_idx = ((i & 1) * elems
+			   + (i & ~1) + ((imask & 1 << i) >> i));
+	      else
+		{
+		  /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
+		     controls for each element of the destination.  */
+		  unsigned j = i % 4;
+		  sel_idx = (((i & 2) >> 1) * elems
+			     + (i & ~3) + ((imask & 3 << j << j) >> j >> j));
+		}
+	      elts.quick_push (build_int_cst (itype, sel_idx));
+	    }
+
 	  tree omask = elts.build ();
 	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
 					   VEC_PERM_EXPR,
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
index d1ac01e1c88..8df5b9d4441 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
@@ -7,11 +7,12 @@
 #include <immintrin.h>
 
 __m512d x;
+__m512d y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_pd (x, x, 56);
+  x = _mm512_shuffle_pd (x, y, 56);
   x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_pd (2, x, x, 56);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
index 07a63fca3ff..378ae4b7101 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
@@ -7,11 +7,12 @@
 #include <immintrin.h>
 
 __m512 x;
+__m512 y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_ps (x, x, 56);
+  x = _mm512_shuffle_ps (x, y, 56);
   x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_ps (2, x, x, 56);
 }
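
As a sanity check on the selector construction above, here is a
standalone sketch that mirrors the sel_idx loop (with the equivalent
(imask >> (2 * j)) & 3 substituted for (imask & 3 << j << j) >> j >> j)
and prints the VEC_PERM_EXPR indices for the imm8 value 56 used in the
testcases:

#include <stdio.h>

int
main (void)
{
  unsigned imask = 56;  /* 0b00111000 */

  /* VSHUFPD, V8DF: even destination lanes select from operand 0, odd
     lanes from operand 1; imm bit i picks the element within each pair
     of adjacent lanes.  */
  unsigned elems = 8;
  printf ("shufpd V8DF:  ");
  for (unsigned i = 0; i != elems; i++)
    printf ("%u ", (i & 1) * elems + (i & ~1u) + ((imask >> i) & 1));
  printf ("\n");  /* 0 8 2 11 5 13 6 14 */

  /* VSHUFPS, V16SF: within each 128-bit lane, the low two dwords come
     from operand 0 and the high two from operand 1; two imm bits per
     element select within the lane.  */
  elems = 16;
  printf ("shufps V16SF: ");
  for (unsigned i = 0; i != elems; i++)
    {
      unsigned j = i % 4;
      printf ("%u ", ((i & 2) >> 1) * elems + (i & ~3u)
                     + ((imask >> (2 * j)) & 3));
    }
  printf ("\n");  /* 0 2 19 16 4 6 23 20 8 10 27 24 12 14 31 28 */
  return 0;
}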
H.J. Lu via Gcc-patches Feb. 15, 2021, 11:33 p.m. | #3
On 12/16/20 3:41 AM, Hongtao Liu via Gcc-patches wrote:
> [...]

I think this should defer to gcc-12.

jeff

Patch (as originally posted)

From 74596b08a91dafcb29441de59544dd857a090564 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 11 Dec 2020 19:02:43 +0800
Subject: [PATCH] [X86] Fold more shuffle builtins to VEC_PERM_EXPR.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A follow-up to https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html

gcc/
	PR target/98167
	* config/i386/i386.c (ix86_gimple_fold_builtin): Handle
	IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
	IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
	IX86_BUILTIN_SHUFPS256.

gcc/testsuite/
	* gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
	* gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
---
 gcc/config/i386/i386.c                        | 64 ++++++++++++++++---
 .../gcc.target/i386/avx512f-vshufpd-1.c       |  3 +-
 .../gcc.target/i386/avx512f-vshufps-1.c       |  3 +-
 3 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 54b7e103ba2..432470a916c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18187,21 +18187,67 @@  ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	}
       break;
 
+    case IX86_BUILTIN_SHUFPD512:
+    case IX86_BUILTIN_SHUFPS512:
+      if (n_args > 2)
+	{
+	  /* This is a masked shuffle.  Only optimize if the mask is all ones.  */
+	  tree argl = gimple_call_arg (stmt, n_args - 1);
+	  arg0 = gimple_call_arg (stmt, 0);
+	  if (!tree_fits_uhwi_p (argl))
+	    break;
+	  unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
+	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+	  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
+	    break;
+	}
+      /* Fall thru.  */
     case IX86_BUILTIN_SHUFPD:
+    case IX86_BUILTIN_SHUFPD256:
+    case IX86_BUILTIN_SHUFPS:
+    case IX86_BUILTIN_SHUFPS256:
       arg2 = gimple_call_arg (stmt, 2);
       if (TREE_CODE (arg2) == INTEGER_CST)
 	{
-	  location_t loc = gimple_location (stmt);
-	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
 	  arg0 = gimple_call_arg (stmt, 0);
+	  unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+	  machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
+	  unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
+
+	  /* Check valid imm, refer to gcc.target/i386/testimm-10.c.  */
+	  if (imask > 255
+	      || (imask >= HOST_WIDE_INT_1U << elems
+		  && imode == E_DFmode))
+	    return false;
+
 	  arg1 = gimple_call_arg (stmt, 1);
-	  tree itype = long_long_integer_type_node;
-	  tree vtype = build_vector_type (itype, 2); /* V2DI */
-	  tree_vector_builder elts (vtype, 2, 1);
-	  /* Ignore bits other than the lowest 2.  */
-	  elts.quick_push (build_int_cst (itype, imask & 1));
-	  imask >>= 1;
-	  elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
+	  location_t loc = gimple_location (stmt);
+	  tree itype = imode == E_DFmode
+	    ? long_long_integer_type_node : integer_type_node;
+	  /* V2DI/V4DI/V8DI/V4SI/V8SI/V16SI  */
+	  tree vtype = build_vector_type (itype, elems);
+	  tree_vector_builder elts (vtype, elems, 1);
+
+	  for (unsigned i = 0; i != elems; i++)
+	    {
+	      unsigned sel_idx;
+	      /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6])
+		 provide 2 select controls for each element of the
+		 destination.  */
+	      if (imode == E_DFmode)
+		sel_idx = (i & 1) * elems
+		  + (i >> 1 << 1) + ((imask & 1 << i) >> i);
+	      else
+		{
+		  /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
+		     controls for each element of the destination.  */
+		  unsigned j = i % 4;
+		  sel_idx = ((i & 2) >> 1) * elems
+		    + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j);
+		}
+	      elts.quick_push (build_int_cst (itype, sel_idx));
+	    }
+
 	  tree omask = elts.build ();
 	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
 					   VEC_PERM_EXPR,
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
index d1ac01e1c88..8df5b9d4441 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
@@ -7,11 +7,12 @@ 
 #include <immintrin.h>
 
 __m512d x;
+__m512d y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_pd (x, x, 56);
+  x = _mm512_shuffle_pd (x, y, 56);
   x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_pd (2, x, x, 56);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
index 07a63fca3ff..378ae4b7101 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
@@ -7,11 +7,12 @@ 
 #include <immintrin.h>
 
 __m512 x;
+__m512 y;
 
 void extern
 avx512f_test (void)
 {
-  x = _mm512_shuffle_ps (x, x, 56);
+  x = _mm512_shuffle_ps (x, y, 56);
   x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
   x = _mm512_maskz_shuffle_ps (2, x, x, 56);
 }
-- 
2.18.1
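
With the fold in place, the adjusted testcase statement
x = _mm512_shuffle_pd (x, y, 56) should reach the optimized IL as a
single permutation.  Expected shape of the dump (an editorial
assumption derived from the selector arithmetic above, not a captured
dump):

  x = VEC_PERM_EXPR <x, y, { 0, 8, 2, 11, 5, 13, 6, 14 }>;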