V2 [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

Message ID 20181105220234.GA4696@intel.com
State New
Headers show
Series
  • V2 [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
Related show

Commit Message

H.J. Lu Nov. 5, 2018, 10:02 p.m.
Hi Richard, Jakub,

Can you take a look at this patch?  The last review from Kirill was in
June.

Thanks.


H.J.
--
There are many duplicated AVX2/AVX512 vec_dup patterns like:

(define_insn "avx2_vec_dup<mode>"
  [(set (match_operand:VF1_128_256 0 "register_operand" "=v")
        (vec_duplicate:VF1_128_256
          (vec_select:SF
            (match_operand:V4SF 1 "register_operand" "v")
            (parallel [(const_int 0)]))))]
  "TARGET_AVX2"
  "vbroadcastss\t{%1, %0|%0, %1}"
  [(set_attr "type" "sselog1")
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "<MODE>")])

and

(define_insn "vec_dup<mode>"
  [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x")
        (vec_duplicate:AVX_VEC_DUP_MODE
          (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "m,m,x,v,?x")))]
  "TARGET_AVX"
  "@
   v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}
   vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1}
   v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}
   v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1}
   #"
  [(set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "maybe_evex")
   (set_attr "isa" "avx2,noavx2,avx2,avx512f,noavx2")
   (set_attr "mode" "<sseinsnmode>,V8SF,<sseinsnmode>,<sseinsnmode>,V8SF")])

We can remove the duplicated AVX2/AVX512 vec_dup patterns and use the
normal AVX2/AVX512 vec_dup patterns instead by changing source operand
to subreg of the same register class of the base by generating

(set (reg:V8SF 84)
     (vec_duplicate:V8SF (subreg:SF (reg:V4SF 85) 0)))

instead of

(set (reg:V8SF 84)
      (vec_duplicate:V8SF
	(vec_select:SF (reg:V4SF 85)
	  (parallel [(const_int 0 [0])]))))

For integer vector broadcast, we generate

(set (reg:V32QI 86)
     (vec_duplicate:V32QI
	(vec_select:QI (subreg:V16QI (reg:V32QI 87) 0))
	  (parallel [(const_int 0 [0])]))))

instead of

(set (reg:V32QI 86)
     (vec_duplicate:V32QI
	(vec_select:QI (reg:V32QI 87)
	  (parallel [(const_int 0 [0])]))))

so that we can remove

(define_insn "avx2_pbroadcast<mode>_1"
  [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
        (vec_duplicate:VI_256
          (vec_select:<ssescalarmode>
            (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
            (parallel [(const_int 0)]))))]
  "TARGET_AVX2"
  "@
   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
  [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>")
   (set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "<sseinsnmode>")])

and keep only

(define_insn "avx2_pbroadcast<mode>"
  [(set (match_operand:VI 0 "register_operand" "=x,v")
        (vec_duplicate:VI
          (vec_select:<ssescalarmode>
            (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm,vm")
            (parallel [(const_int 0)]))))]
  "TARGET_AVX2"
  "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}"
  [(set_attr "isa" "*,<pbroadcast_evex_isa>")
   (set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex,evex")
   (set_attr "mode" "<sseinsnmode>")])

gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

 avx2_test:
 	.cfi_startproc
-	vmovaps	x(%rip), %xmm1
-	vbroadcastss	%xmm1, %ymm0
+	vbroadcastss	x(%rip), %ymm0
 	vmovaps	%ymm0, y(%rip)
 	vzeroupper
 	ret
	.cfi_endproc

gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

@@ -113,7 +113,7 @@ f10:
 	.cfi_startproc
 	vmovaps	%ymm0, %ymm16
 	vpermilps	$85, %ymm16, %ymm16
-	vbroadcastss	%xmm16, %ymm16
+	vshuff32x4	$0x0, %ymm16, %ymm16, %ymm16
 	vzeroupper
 	ret
 	.cfi_endproc
@@ -153,8 +153,7 @@ f12:
 f13:
 .LFB12:
 	.cfi_startproc
-	vmovaps	(%rdi), %ymm16
-	vbroadcastss	%xmm16, %ymm16
+	vbroadcastss	(%rdi), %ymm16
 	vzeroupper
 	ret
 	.cfi_endproc

gcc/

	* config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf,
	CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with
	CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and
	CODE_FOR_vec_dupv4df, respectively.
	* config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup.
	* config/i386/i386.md (SF to DF splitter): Replace
	gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf.
	* config/i386/sse.md (VF48_AVX512VL): New.
	(avx2_vec_dup<mode>): Removed.
	(avx2_vec_dupv8sf_1): Likewise.
	(avx512f_vec_dup<mode>_1): Likewise.
	(avx2_pbroadcast<mode>_1): Likewise.
	(avx2_vec_dupv4df): Likewise.
	(<avx512>_vec_dup<mode>_1): Likewise.
	(*avx_vperm_broadcast_<mode>): Replace gen_avx2_vec_dupv8sf with
	gen_vec_dupv8sf.

gcc/testsuite/

	* gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.
	* gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise.
---
 gcc/config/i386/i386-builtin.def              |  6 +-
 gcc/config/i386/i386.c                        | 57 ++++++++++---
 gcc/config/i386/i386.md                       |  2 +-
 gcc/config/i386/sse.md                        | 83 +------------------
 .../i386/avx2-vbroadcastss_ps256-1.c          |  3 +-
 .../gcc.target/i386/avx512vl-vbroadcast-3.c   |  5 +-
 6 files changed, 56 insertions(+), 100 deletions(-)

-- 
2.17.2

Comments

H.J. Lu Nov. 20, 2018, 5:13 p.m. | #1
On Mon, Nov 5, 2018 at 2:02 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
>

> Hi Richard, Jakub,

>

> Can you take a look at this patch?  The last review from Kirill was in

> June.

>

> Thanks.

>

>

> H.J.

> --

> There are many duplicated AVX2/AVX512 vec_dup patterns like:

>

> (define_insn "avx2_vec_dup<mode>"

>   [(set (match_operand:VF1_128_256 0 "register_operand" "=v")

>         (vec_duplicate:VF1_128_256

>           (vec_select:SF

>             (match_operand:V4SF 1 "register_operand" "v")

>             (parallel [(const_int 0)]))))]

>   "TARGET_AVX2"

>   "vbroadcastss\t{%1, %0|%0, %1}"

>   [(set_attr "type" "sselog1")

>     (set_attr "prefix" "maybe_evex")

>     (set_attr "mode" "<MODE>")])

>

> and

>

> (define_insn "vec_dup<mode>"

>   [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x")

>         (vec_duplicate:AVX_VEC_DUP_MODE

>           (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "m,m,x,v,?x")))]

>   "TARGET_AVX"

>   "@

>    v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0|%0, %1}

>    vbroadcast<vecdupssescalarmodesuffix>\t{%1, %0|%0, %1}

>    v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}

>    v<sseintprefix>broadcast<bcstscalarsuff>\t{%x1, %g0|%g0, %x1}

>    #"

>   [(set_attr "type" "ssemov")

>    (set_attr "prefix_extra" "1")

>    (set_attr "prefix" "maybe_evex")

>    (set_attr "isa" "avx2,noavx2,avx2,avx512f,noavx2")

>    (set_attr "mode" "<sseinsnmode>,V8SF,<sseinsnmode>,<sseinsnmode>,V8SF")])

>

> We can remove the duplicated AVX2/AVX512 vec_dup patterns and use the

> normal AVX2/AVX512 vec_dup patterns instead by changing source operand

> to subreg of the same register class of the base by generating

>

> (set (reg:V8SF 84)

>      (vec_duplicate:V8SF (subreg:SF (reg:V4SF 85) 0)))

>

> instead of

>

> (set (reg:V8SF 84)

>       (vec_duplicate:V8SF

>         (vec_select:SF (reg:V4SF 85)

>           (parallel [(const_int 0 [0])]))))

>

> For integer vector broadcast, we generate

>

> (set (reg:V32QI 86)

>      (vec_duplicate:V32QI

>         (vec_select:QI (subreg:V16QI (reg:V32QI 87) 0))

>           (parallel [(const_int 0 [0])]))))

>

> instead of

>

> (set (reg:V32QI 86)

>      (vec_duplicate:V32QI

>         (vec_select:QI (reg:V32QI 87)

>           (parallel [(const_int 0 [0])]))))

>

> so that we can remove

>

> (define_insn "avx2_pbroadcast<mode>_1"

>   [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")

>         (vec_duplicate:VI_256

>           (vec_select:<ssescalarmode>

>             (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")

>             (parallel [(const_int 0)]))))]

>   "TARGET_AVX2"

>   "@

>    vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}

>    vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}

>    vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}

>    vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"

>   [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>")

>    (set_attr "type" "ssemov")

>    (set_attr "prefix_extra" "1")

>    (set_attr "prefix" "vex")

>    (set_attr "mode" "<sseinsnmode>")])

>

> and keep only

>

> (define_insn "avx2_pbroadcast<mode>"

>   [(set (match_operand:VI 0 "register_operand" "=x,v")

>         (vec_duplicate:VI

>           (vec_select:<ssescalarmode>

>             (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm,vm")

>             (parallel [(const_int 0)]))))]

>   "TARGET_AVX2"

>   "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}"

>   [(set_attr "isa" "*,<pbroadcast_evex_isa>")

>    (set_attr "type" "ssemov")

>    (set_attr "prefix_extra" "1")

>    (set_attr "prefix" "vex,evex")

>    (set_attr "mode" "<sseinsnmode>")])

>

> gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

>

>  avx2_test:

>         .cfi_startproc

> -       vmovaps x(%rip), %xmm1

> -       vbroadcastss    %xmm1, %ymm0

> +       vbroadcastss    x(%rip), %ymm0

>         vmovaps %ymm0, y(%rip)

>         vzeroupper

>         ret

>         .cfi_endproc

>

> gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

>

> @@ -113,7 +113,7 @@ f10:

>         .cfi_startproc

>         vmovaps %ymm0, %ymm16

>         vpermilps       $85, %ymm16, %ymm16

> -       vbroadcastss    %xmm16, %ymm16

> +       vshuff32x4      $0x0, %ymm16, %ymm16, %ymm16

>         vzeroupper

>         ret

>         .cfi_endproc

> @@ -153,8 +153,7 @@ f12:

>  f13:

>  .LFB12:

>         .cfi_startproc

> -       vmovaps (%rdi), %ymm16

> -       vbroadcastss    %xmm16, %ymm16

> +       vbroadcastss    (%rdi), %ymm16

>         vzeroupper

>         ret

>         .cfi_endproc

>

> gcc/

>

>         * config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf,

>         CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with

>         CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and

>         CODE_FOR_vec_dupv4df, respectively.

>         * config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup.

>         * config/i386/i386.md (SF to DF splitter): Replace

>         gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf.

>         * config/i386/sse.md (VF48_AVX512VL): New.

>         (avx2_vec_dup<mode>): Removed.

>         (avx2_vec_dupv8sf_1): Likewise.

>         (avx512f_vec_dup<mode>_1): Likewise.

>         (avx2_pbroadcast<mode>_1): Likewise.

>         (avx2_vec_dupv4df): Likewise.

>         (<avx512>_vec_dup<mode>_1): Likewise.

>         (*avx_vperm_broadcast_<mode>): Replace gen_avx2_vec_dupv8sf with

>         gen_vec_dupv8sf.

>

> gcc/testsuite/

>

>         * gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.

>         * gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise.


PING:

https://gcc.gnu.org/ml/gcc-patches/2018-11/msg00315.html

-- 
H.J.

Patch

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index df0f7e975ac..d217add8ee2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1194,9 +1194,9 @@  BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 963c7fcbb34..6b95d774ad1 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -45963,28 +45963,41 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	{
 	  /* Use vpbroadcast{b,w,d}.  */
 	  rtx (*gen) (rtx, rtx) = NULL;
+	  machine_mode smode = VOIDmode;
 	  switch (d->vmode)
 	    {
 	    case E_V64QImode:
 	      if (TARGET_AVX512BW)
-		gen = gen_avx512bw_vec_dupv64qi_1;
+		{
+		  smode = V16QImode;
+		  gen = gen_avx512bw_vec_dupv64qi;
+		}
 	      break;
 	    case E_V32QImode:
-	      gen = gen_avx2_pbroadcastv32qi_1;
+	      smode = V16QImode;
+	      gen = gen_avx2_pbroadcastv32qi;
 	      break;
 	    case E_V32HImode:
 	      if (TARGET_AVX512BW)
-		gen = gen_avx512bw_vec_dupv32hi_1;
+		{
+		  smode = V8HImode;
+		  gen = gen_avx512bw_vec_dupv32hi;
+		}
 	      break;
 	    case E_V16HImode:
-	      gen = gen_avx2_pbroadcastv16hi_1;
+	      smode = V8HImode;
+	      gen = gen_avx2_pbroadcastv16hi;
 	      break;
 	    case E_V16SImode:
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv16si_1;
+		{
+		  smode = V4SImode;
+		  gen = gen_avx512f_vec_dupv16si;
+		}
 	      break;
 	    case E_V8SImode:
-	      gen = gen_avx2_pbroadcastv8si_1;
+	      smode = V4SImode;
+	      gen = gen_avx2_pbroadcastv8si;
 	      break;
 	    case E_V16QImode:
 	      gen = gen_avx2_pbroadcastv16qi;
@@ -45993,19 +46006,25 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	      gen = gen_avx2_pbroadcastv8hi;
 	      break;
 	    case E_V16SFmode:
+	      smode = SFmode;
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv16sf_1;
+		gen = gen_avx512f_vec_dupv16sf;
 	      break;
 	    case E_V8SFmode:
-	      gen = gen_avx2_vec_dupv8sf_1;
+	      smode = SFmode;
+	      gen = gen_vec_dupv8sf;
 	      break;
 	    case E_V8DFmode:
+	      smode = DFmode;
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv8df_1;
+		gen = gen_avx512f_vec_dupv8df;
 	      break;
 	    case E_V8DImode:
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv8di_1;
+		{
+		  smode = V2DImode;
+		  gen = gen_avx512f_vec_dupv8di;
+		}
 	      break;
 	    /* For other modes prefer other shuffles this function creates.  */
 	    default: break;
@@ -46013,7 +46032,23 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	  if (gen != NULL)
 	    {
 	      if (!d->testing_p)
-		emit_insn (gen (d->target, d->op0));
+		{
+		  if (smode == VOIDmode)
+		    emit_insn (gen (d->target, d->op0));
+		  else
+		    {
+		      rtx op = d->op0;
+		      unsigned int oppos = 0;
+		      if (SUBREG_P (op))
+			{
+			  op = SUBREG_REG (op);
+			  oppos = SUBREG_BYTE (op);
+			}
+		      emit_insn (gen (d->target,
+				      gen_rtx_SUBREG (smode, op,
+						      oppos)));
+		    }
+		}
 	      return true;
 	    }
 	}
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7fb2b144f47..4a6fa077db5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4399,7 +4399,7 @@ 
       else
 	{
 	  rtx tmp = lowpart_subreg (V16SFmode, operands[3], V4SFmode);
-	  emit_insn (gen_avx512f_vec_dupv16sf_1 (tmp, tmp));
+	  emit_insn (gen_avx512f_vec_dupv16sf (tmp, tmp));
 	}
     }
   else
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ee73e1fdf80..90a700c154a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -7117,42 +7117,6 @@ 
    (set_attr "prefix" "orig,maybe_evex")
    (set_attr "mode" "SF")])
 
-(define_insn "avx2_vec_dup<mode>"
-  [(set (match_operand:VF1_128_256 0 "register_operand" "=v")
-	(vec_duplicate:VF1_128_256
-	  (vec_select:SF
-	    (match_operand:V4SF 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "vbroadcastss\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sselog1")
-    (set_attr "prefix" "maybe_evex")
-    (set_attr "mode" "<MODE>")])
-
-(define_insn "avx2_vec_dupv8sf_1"
-  [(set (match_operand:V8SF 0 "register_operand" "=v")
-	(vec_duplicate:V8SF
-	  (vec_select:SF
-	    (match_operand:V8SF 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "vbroadcastss\t{%x1, %0|%0, %x1}"
-  [(set_attr "type" "sselog1")
-    (set_attr "prefix" "maybe_evex")
-    (set_attr "mode" "V8SF")])
-
-(define_insn "avx512f_vec_dup<mode>_1"
-  [(set (match_operand:VF_512 0 "register_operand" "=v")
-	(vec_duplicate:VF_512
-	  (vec_select:<ssescalarmode>
-	    (match_operand:VF_512 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX512F"
-  "vbroadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}"
-  [(set_attr "type" "sselog1")
-    (set_attr "prefix" "evex")
-    (set_attr "mode" "<MODE>")])
-
 ;; Although insertps takes register source, we prefer
 ;; unpcklps with register source since it is shorter.
 (define_insn "*vec_concatv2sf_sse4_1"
@@ -17918,24 +17882,6 @@ 
    (set_attr "prefix" "vex,evex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "avx2_pbroadcast<mode>_1"
-  [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
-	(vec_duplicate:VI_256
-	  (vec_select:<ssescalarmode>
-	    (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "@
-   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
-   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
-  [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>")
-   (set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "<sseinsnmode>")])
-
 (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
   [(set (match_operand:VI48F_256_512 0 "register_operand" "=v")
 	(unspec:VI48F_256_512
@@ -18111,32 +18057,6 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
-(define_insn "avx2_vec_dupv4df"
-  [(set (match_operand:V4DF 0 "register_operand" "=v")
-	(vec_duplicate:V4DF
-	  (vec_select:DF
-	    (match_operand:V2DF 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "vbroadcastsd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sselog1")
-   (set_attr "prefix" "maybe_evex")
-   (set_attr "mode" "V4DF")])
-
-(define_insn "<avx512>_vec_dup<mode>_1"
-  [(set (match_operand:VI_AVX512BW 0 "register_operand" "=v,v")
-	(vec_duplicate:VI_AVX512BW
-	  (vec_select:<ssescalarmode>
-	    (match_operand:VI_AVX512BW 1 "nonimmediate_operand" "v,m")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX512F"
-  "@
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %<iptr>1}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "<sseinsnmode>")])
-
 (define_insn "<avx512>_vec_dup<mode><mask_name>"
   [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
 	(vec_duplicate:V48_AVX512VL
@@ -18545,8 +18465,7 @@ 
 	     or VSHUFF128.  */
 	  gcc_assert (<MODE>mode == V8SFmode);
 	  if ((mask & 1) == 0)
-	    emit_insn (gen_avx2_vec_dupv8sf (op0,
-					     gen_lowpart (V4SFmode, op0)));
+	    emit_insn (gen_vec_dupv8sf (op0, gen_lowpart (V4SFmode, op0)));
 	  else
 	    emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
 						  GEN_INT (4), GEN_INT (5),
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c
index dfac3916b08..3ff7497aa21 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c
@@ -1,6 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx2 -O2" } */
-/* { dg-final { scan-assembler "vbroadcastss\[ \\t\]+\[^\n\]*%xmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "vbroadcastss\[ \\t\]+\[^\n\]*%ymm\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "vmovaps\[\t \]*\[^,\]*,%xmm\[0-9\]" } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c
index 7233398cd64..1c62364dac4 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c
@@ -151,8 +151,8 @@  f16 (V2 *x)
 }
 
 /* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%xmm16" 4 } } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 3 } } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%ymm16" 3 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%ymm16" 4 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$170\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
@@ -160,3 +160,4 @@  f16 (V2 *x)
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */
 /* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$3\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */
+/* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */