i386: Add missing two element 64bit vector permutations [PR89021]

Message ID CAFULd4YKVR9QcPw6Ed6gTrHTAjvBi4_RLqsrowoB_D7v-p2LkQ@mail.gmail.com
State New
Headers show
Series
  • i386: Add missing two element 64bit vector permutations [PR89021]
Related show

Commit Message

Michael Meissner via Gcc-patches June 16, 2021, 2:08 p.m.
In addition to V8QI permutations, several other missing permutations are
added for 64bit vector modes for TARGET_SSSE3 and TARGET_SSE4_1 targets.

2021-06-16  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
    PR target/89021
    * config/i386/i386-expand.c (expand_vec_perm_2perm_pblendv):
    Handle 64bit modes for TARGET_SSE4_1.
    (expand_vec_perm_pshufb2): Handle 64bit modes for TARGET_SSSE3.
    (expand_vec_perm_even_odd_pack): Handle V4HI mode.
    (expand_vec_perm_even_odd_1) <case E_V4HImode>: Expand via
    expand_vec_perm_pshufb2 for TARGET_SSSE3 and via
    expand_vec_perm_even_odd_pack for TARGET_SSE4_1.
    * config/i386/mmx.md (mmx_packusdw): New insn pattern.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index dee3df2e3a0..eb6f9b0684e 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -18972,7 +18974,8 @@  expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
-  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
+			     || GET_MODE_SIZE (vmode) == 8))
     ;
   else
     return false;
@@ -19229,14 +19232,31 @@  expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
 {
   rtx rperm[2][16], vperm, l, h, op, m128;
   unsigned int i, nelt, eltsz;
+  machine_mode mode;
+  rtx (*gen) (rtx, rtx, rtx);
 
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
+			&& GET_MODE_SIZE (d->vmode) != 8))
     return false;
   gcc_assert (!d->one_operand_p);
 
   if (d->testing_p)
     return true;
 
+  switch (GET_MODE_SIZE (d->vmode))
+    {
+    case 8:
+      mode = V8QImode;
+      gen = gen_mmx_pshufbv8qi3;
+      break;
+    case 16:
+      mode = V16QImode;
+      gen = gen_ssse3_pshufbv16qi3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
   nelt = d->nelt;
   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
 
@@ -19247,7 +19267,7 @@  expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
   m128 = GEN_INT (-128);
   for (i = 0; i < nelt; ++i)
     {
-      unsigned j, e = d->perm[i];
+      unsigned j, k, e = d->perm[i];
       unsigned which = (e >= nelt);
       if (e >= nelt)
 	e -= nelt;
@@ -19257,26 +19277,29 @@  expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
 	  rperm[1-which][i*eltsz + j] = m128;
 	}
+
+      for (k = i*eltsz + j; k < 16; ++k)
+	rperm[0][k] = rperm[1][k] = m128;
     }
 
   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
   vperm = force_reg (V16QImode, vperm);
 
-  l = gen_reg_rtx (V16QImode);
-  op = gen_lowpart (V16QImode, d->op0);
-  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
+  l = gen_reg_rtx (mode);
+  op = gen_lowpart (mode, d->op0);
+  emit_insn (gen (l, op, vperm));
 
   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
   vperm = force_reg (V16QImode, vperm);
 
-  h = gen_reg_rtx (V16QImode);
-  op = gen_lowpart (V16QImode, d->op1);
-  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
+  h = gen_reg_rtx (mode);
+  op = gen_lowpart (mode, d->op1);
+  emit_insn (gen (h, op, vperm));
 
   op = d->target;
-  if (d->vmode != V16QImode)
-    op = gen_reg_rtx (V16QImode);
-  emit_insn (gen_iorv16qi3 (op, l, h));
+  if (d->vmode != mode)
+    op = gen_reg_rtx (mode);
+  emit_insn (gen_rtx_SET (op, gen_rtx_IOR (mode, l, h)));
   if (op != d->target)
     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
@@ -19455,6 +19478,17 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
 
   switch (d->vmode)
     {
+    case E_V4HImode:
+      /* Required for "pack".  */
+      if (!TARGET_SSE4_1)
+	return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V2SImode;
+      gen_and = gen_andv2si3;
+      gen_pack = gen_mmx_packusdw;
+      gen_shift = gen_lshrv2si3;
+      break;
     case E_V8HImode:
       /* Required for "pack".  */
       if (!TARGET_SSE4_1)
@@ -19507,7 +19541,7 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       end_perm = true;
       break;
     default:
-      /* Only V8QI, V8HI, V16QI, V16HI and V32QI modes
+      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
 	 are more profitable than general shuffles.  */
       return false;
     }
@@ -19698,18 +19732,25 @@  expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       break;
 
     case E_V4HImode:
-      if (d->testing_p)
-	break;
-      /* We need 2*log2(N)-1 operations to achieve odd/even
-	 with interleave. */
-      t1 = gen_reg_rtx (V4HImode);
-      emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
-      emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
-      if (odd)
-	t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+      if (TARGET_SSE4_1)
+	return expand_vec_perm_even_odd_pack (d);
+      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+	return expand_vec_perm_pshufb2 (d);
       else
-	t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
-      emit_insn (t2);
+	{
+	  if (d->testing_p)
+	    break;
+	  /* We need 2*log2(N)-1 operations to achieve odd/even
+	     with interleave. */
+	  t1 = gen_reg_rtx (V4HImode);
+	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
+	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
+	  if (odd)
+	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+	  else
+	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
+	  emit_insn (t2);
+	}
       break;
 
     case E_V8HImode:
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 1a9e7b024dd..59a16f4cd50 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2477,6 +2477,22 @@  (define_insn_and_split "mmx_packssdw"
    (set_attr "type" "mmxshft,sselog,sselog")
    (set_attr "mode" "DI,TI,TI")])
 
+(define_insn_and_split "mmx_packusdw"
+  [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw")
+	(vec_concat:V4HI
+	  (us_truncate:V2HI
+	    (match_operand:V2SI 1 "register_operand" "0,0,Yw"))
+	  (us_truncate:V2HI
+	    (match_operand:V2SI 2 "register_operand" "Yr,*x,Yw"))))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_pack (operands, US_TRUNCATE); DONE;"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "mode" "TI")])
+
 (define_insn_and_split "mmx_punpckhbw"
   [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
 	(vec_select:V8QI