i386: Remove duplicated AVX2/AVX512 vec_dup patterns

Message ID 20181102172506.GA21596@intel.com
State Superseded
Headers show
Series
  • i386: Remove duplicated AVX2/AVX512 vec_dup patterns
Related show

Commit Message

H.J. Lu Nov. 2, 2018, 5:25 p.m.
Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with
subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

 avx2_test:
 	.cfi_startproc
-	vmovaps	x(%rip), %xmm1
-	vbroadcastss	%xmm1, %ymm0
+	vbroadcastss	x(%rip), %ymm0
 	vmovaps	%ymm0, y(%rip)
 	vzeroupper
 	ret
	.cfi_endproc

gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

@@ -113,7 +113,7 @@ f10:
 	.cfi_startproc
 	vmovaps	%ymm0, %ymm16
 	vpermilps	$85, %ymm16, %ymm16
-	vbroadcastss	%xmm16, %ymm16
+	vshuff32x4	$0x0, %ymm16, %ymm16, %ymm16
 	vzeroupper
 	ret
 	.cfi_endproc
@@ -153,8 +153,7 @@ f12:
 f13:
 .LFB12:
 	.cfi_startproc
-	vmovaps	(%rdi), %ymm16
-	vbroadcastss	%xmm16, %ymm16
+	vbroadcastss	(%rdi), %ymm16
 	vzeroupper
 	ret
 	.cfi_endproc

OK for trunk?

Thanks.

H.J.
--
gcc/

	* config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf,
	CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with
	CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and
	CODE_FOR_vec_dupv4df, respectively.
	* config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup.
	* config/i386/i386.md (SF to DF splitter): Replace
	gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf.
	* config/i386/sse.md (VF48_AVX512VL): New.
	(avx2_vec_dup<mode>): Removed.
	(avx2_vec_dupv8sf_1): Likewise.
	(avx512f_vec_dup<mode>_1): Likewise.
	(avx2_pbroadcast<mode>_1): Likewise.
	(avx2_vec_dupv4df): Likewise.
	(<avx512>_vec_dup<mode>_1): Likewise.
	(*avx_vperm_broadcast_<mode>): Replace gen_avx2_vec_dupv8sf with
	gen_vec_dupv8sf.

gcc/testsuite/

	* gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.
	* gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise.
---
 gcc/config/i386/i386-builtin.def              |  6 +-
 gcc/config/i386/i386.c                        | 57 ++++++++++---
 gcc/config/i386/i386.md                       |  2 +-
 gcc/config/i386/sse.md                        | 83 +------------------
 .../i386/avx2-vbroadcastss_ps256-1.c          |  3 +-
 .../gcc.target/i386/avx512vl-vbroadcast-3.c   |  5 +-
 6 files changed, 56 insertions(+), 100 deletions(-)

-- 
2.17.2

Comments

Uros Bizjak Nov. 4, 2018, 4:41 p.m. | #1
On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
>

> Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with

> subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

>

>  avx2_test:

>         .cfi_startproc

> -       vmovaps x(%rip), %xmm1

> -       vbroadcastss    %xmm1, %ymm0

> +       vbroadcastss    x(%rip), %ymm0

>         vmovaps %ymm0, y(%rip)

>         vzeroupper

>         ret

>         .cfi_endproc

>

> gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

>

> @@ -113,7 +113,7 @@ f10:

>         .cfi_startproc

>         vmovaps %ymm0, %ymm16

>         vpermilps       $85, %ymm16, %ymm16

> -       vbroadcastss    %xmm16, %ymm16

> +       vshuff32x4      $0x0, %ymm16, %ymm16, %ymm16

>         vzeroupper

>         ret

>         .cfi_endproc

> @@ -153,8 +153,7 @@ f12:

>  f13:

>  .LFB12:

>         .cfi_startproc

> -       vmovaps (%rdi), %ymm16

> -       vbroadcastss    %xmm16, %ymm16

> +       vbroadcastss    (%rdi), %ymm16

>         vzeroupper

>         ret

>         .cfi_endproc


Actually, we can achieve the same with pre-reload splitters. Please
see the attached patch for a couple of examples and a fix for
vbroadcastss that accesses the memory in wrong mode.

Uros.
Index: sse.md
===================================================================
--- sse.md	(revision 265740)
+++ sse.md	(working copy)
@@ -7129,6 +7129,20 @@
     (set_attr "prefix" "maybe_evex")
     (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "*avx2_vec_dup<mode>_1"
+  [(set (match_operand:VF1_128_256 0 "register_operand")
+	(vec_duplicate:VF1_128_256
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "memory_operand")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(vec_duplicate:VF1_128_256 (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], SFmode, 0);")
+
 (define_insn "avx2_vec_dupv8sf_1"
   [(set (match_operand:V8SF 0 "register_operand" "=v")
 	(vec_duplicate:V8SF
@@ -7141,6 +7155,20 @@
     (set_attr "prefix" "maybe_evex")
     (set_attr "mode" "V8SF")])
 
+(define_insn_and_split "*avx2_vec_dupv8sf_1"
+  [(set (match_operand:V8SF 0 "register_operand")
+	(vec_duplicate:V8SF
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "memory_operand")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(vec_duplicate:VF1_128_256 (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], SFmode, 0);")
+
 (define_insn "avx512f_vec_dup<mode>_1"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
 	(vec_duplicate:VF_512
@@ -17908,7 +17936,7 @@
   [(set (match_operand:VI 0 "register_operand" "=x,v")
 	(vec_duplicate:VI
 	  (vec_select:<ssescalarmode>
-	    (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm,vm")
+	    (match_operand:<ssexmmmode> 1 "register_operand" "x,v")
 	    (parallel [(const_int 0)]))))]
   "TARGET_AVX2"
   "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}"
@@ -17918,24 +17946,64 @@
    (set_attr "prefix" "vex,evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*avx2_pbroadcast<mode>_mem_1"
+  [(set (match_operand:VI 0 "register_operand")
+	(vec_duplicate:VI
+	  (vec_select:<ssescalarmode>
+	    (match_operand:<ssexmmmode> 1 "memory_operand")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(vec_duplicate:VI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], <ssescalarmode>mode, 0);")
+
 (define_insn "avx2_pbroadcast<mode>_1"
-  [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
+  [(set (match_operand:VI_256 0 "register_operand" "=x,v")
 	(vec_duplicate:VI_256
 	  (vec_select:<ssescalarmode>
-	    (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
+	    (match_operand:VI_256 1 "register_operand" "x,v")
 	    (parallel [(const_int 0)]))))]
   "TARGET_AVX2"
-  "@
-   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
-   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
-  [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>")
+  "vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
+  [(set_attr "isa" "*,<pbroadcast_evex_isa>")
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*avx2_pbroadcast<mode>_1_mem_1"
+  [(set (match_operand:VI_256 0 "register_operand" "=x,v")
+	(vec_duplicate:VI_256
+	  (vec_select:<ssescalarmode>
+	    (match_operand:VI_256 1 "memory_operand" "m,m")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(vec_duplicate:VI_256 (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], <ssescalarmode>mode, 0);")
+
+(define_insn "*avx2_pbroadcast<mode>_mem"
+  [(set (match_operand:VI 0 "register_operand" "=x,v")
+	(vec_duplicate:VI
+	  (match_operand:<ssescalarmode> 1 "memory_operand" "m,m")))]
+  "TARGET_AVX2"
+  "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,<pbroadcast_evex_isa>")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex,evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+
+
+
+
 (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
   [(set (match_operand:VI48F_256_512 0 "register_operand" "=v")
 	(unspec:VI48F_256_512
H.J. Lu Nov. 4, 2018, 7:16 p.m. | #2
On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu <hongjiu.lu@intel.com> wrote:

> >

> > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with

> > subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

> >

> >  avx2_test:

> >         .cfi_startproc

> > -       vmovaps x(%rip), %xmm1

> > -       vbroadcastss    %xmm1, %ymm0

> > +       vbroadcastss    x(%rip), %ymm0

> >         vmovaps %ymm0, y(%rip)

> >         vzeroupper

> >         ret

> >         .cfi_endproc

> >

> > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

> >

> > @@ -113,7 +113,7 @@ f10:

> >         .cfi_startproc

> >         vmovaps %ymm0, %ymm16

> >         vpermilps       $85, %ymm16, %ymm16

> > -       vbroadcastss    %xmm16, %ymm16

> > +       vshuff32x4      $0x0, %ymm16, %ymm16, %ymm16

> >         vzeroupper

> >         ret

> >         .cfi_endproc

> > @@ -153,8 +153,7 @@ f12:

> >  f13:

> >  .LFB12:

> >         .cfi_startproc

> > -       vmovaps (%rdi), %ymm16

> > -       vbroadcastss    %xmm16, %ymm16

> > +       vbroadcastss    (%rdi), %ymm16

> >         vzeroupper

> >         ret

> >         .cfi_endproc

>

> Actually, we can achieve the same with pre-reload splitters. Please

> see the attached patch for a couple of examples and a fix for

> vbroadcastss that accesses the memory in wrong mode.

>


My patch removes a bunch of duplicated patterns from sse.md.  But
yours adds a couple more patterns.   Isn't fewer patterns preferred?

-- 
H.J.
Uros Bizjak Nov. 4, 2018, 7:44 p.m. | #3
On Sun, Nov 4, 2018 at 8:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak <ubizjak@gmail.com> wrote:

> >

> > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu <hongjiu.lu@intel.com> wrote:

> > >

> > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with

> > > subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

> > >

> > >  avx2_test:

> > >         .cfi_startproc

> > > -       vmovaps x(%rip), %xmm1

> > > -       vbroadcastss    %xmm1, %ymm0

> > > +       vbroadcastss    x(%rip), %ymm0

> > >         vmovaps %ymm0, y(%rip)

> > >         vzeroupper

> > >         ret

> > >         .cfi_endproc

> > >

> > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

> > >

> > > @@ -113,7 +113,7 @@ f10:

> > >         .cfi_startproc

> > >         vmovaps %ymm0, %ymm16

> > >         vpermilps       $85, %ymm16, %ymm16

> > > -       vbroadcastss    %xmm16, %ymm16

> > > +       vshuff32x4      $0x0, %ymm16, %ymm16, %ymm16

> > >         vzeroupper

> > >         ret

> > >         .cfi_endproc

> > > @@ -153,8 +153,7 @@ f12:

> > >  f13:

> > >  .LFB12:

> > >         .cfi_startproc

> > > -       vmovaps (%rdi), %ymm16

> > > -       vbroadcastss    %xmm16, %ymm16

> > > +       vbroadcastss    (%rdi), %ymm16

> > >         vzeroupper

> > >         ret

> > >         .cfi_endproc

> >

> > Actually, we can achieve the same with pre-reload splitters. Please

> > see the attached patch for a couple of examples and a fix for

> > vbroadcastss that accesses the memory in wrong mode.

> >

>

> My patch removes a bunch of duplicated patterns from sse.md.  But

> yours adds a couple more patterns.   Isn't fewer patterns preferred?


Playing SUBREG games before reload does not look safe to me. We would
like to create a simpler instruction out of the combination of vector
load and broadcast, so I think that combine+split is the right tool
for this simplification.

BTW: Half of my proposed patch is a fix to a avx2_pbroadcast<mode>{_1}
pattern, which models wrong access to memory.

Uros.
H.J. Lu Nov. 4, 2018, 8:48 p.m. | #4
On Sun, Nov 4, 2018 at 11:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Sun, Nov 4, 2018 at 8:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak <ubizjak@gmail.com> wrote:

> > >

> > > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu <hongjiu.lu@intel.com> wrote:

> > > >

> > > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with

> > > > subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

> > > >

> > > >  avx2_test:

> > > >         .cfi_startproc

> > > > -       vmovaps x(%rip), %xmm1

> > > > -       vbroadcastss    %xmm1, %ymm0

> > > > +       vbroadcastss    x(%rip), %ymm0

> > > >         vmovaps %ymm0, y(%rip)

> > > >         vzeroupper

> > > >         ret

> > > >         .cfi_endproc

> > > >

> > > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

> > > >

> > > > @@ -113,7 +113,7 @@ f10:

> > > >         .cfi_startproc

> > > >         vmovaps %ymm0, %ymm16

> > > >         vpermilps       $85, %ymm16, %ymm16

> > > > -       vbroadcastss    %xmm16, %ymm16

> > > > +       vshuff32x4      $0x0, %ymm16, %ymm16, %ymm16

> > > >         vzeroupper

> > > >         ret

> > > >         .cfi_endproc

> > > > @@ -153,8 +153,7 @@ f12:

> > > >  f13:

> > > >  .LFB12:

> > > >         .cfi_startproc

> > > > -       vmovaps (%rdi), %ymm16

> > > > -       vbroadcastss    %xmm16, %ymm16

> > > > +       vbroadcastss    (%rdi), %ymm16

> > > >         vzeroupper

> > > >         ret

> > > >         .cfi_endproc

> > >

> > > Actually, we can achieve the same with pre-reload splitters. Please

> > > see the attached patch for a couple of examples and a fix for

> > > vbroadcastss that accesses the memory in wrong mode.

> > >

> >

> > My patch removes a bunch of duplicated patterns from sse.md.  But

> > yours adds a couple more patterns.   Isn't fewer patterns preferred?

>

> Playing SUBREG games before reload does not look safe to me. We would


There are plenty of SUBREG usage in i386 backend before preload.  It is
perfectly safe to do so as long as we don't create SUBREG with a different
register class from the base.  Do you have a testcase to show my SUBREG
usage is unsafe?

> like to create a simpler instruction out of the combination of vector

> load and broadcast, so I think that combine+split is the right tool

> for this simplification.


Adding new patterns doesn't simplify the issue.

> BTW: Half of my proposed patch is a fix to a avx2_pbroadcast<mode>{_1}

> pattern, which models wrong access to memory.

>


I will take look at avx2_pbroadcast<mode>{_1}.


-- 
H.J.
Uros Bizjak Nov. 5, 2018, 8:22 a.m. | #5
On Sun, Nov 4, 2018 at 9:49 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > Actually, we can achieve the same with pre-reload splitters. Please

> > > > see the attached patch for a couple of examples and a fix for

> > > > vbroadcastss that accesses the memory in wrong mode.

> > > >

> > >

> > > My patch removes a bunch of duplicated patterns from sse.md.  But

> > > yours adds a couple more patterns.   Isn't fewer patterns preferred?

> >

> > Playing SUBREG games before reload does not look safe to me. We would

>

> There are plenty of SUBREG usage in i386 backend before preload.  It is

> perfectly safe to do so as long as we don't create SUBREG with a different

> register class from the base.  Do you have a testcase to show my SUBREG

> usage is unsafe?


No. However, the patch then substatially changes functionality in the
vector part of the i386 (expand_vec_perm_1), so it needs approval from
the relevant maintainer (Kirill).

Uros.

Patch

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index df0f7e975ac..d217add8ee2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1194,9 +1194,9 @@  BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 963c7fcbb34..6b95d774ad1 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -45963,28 +45963,41 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	{
 	  /* Use vpbroadcast{b,w,d}.  */
 	  rtx (*gen) (rtx, rtx) = NULL;
+	  machine_mode smode = VOIDmode;
 	  switch (d->vmode)
 	    {
 	    case E_V64QImode:
 	      if (TARGET_AVX512BW)
-		gen = gen_avx512bw_vec_dupv64qi_1;
+		{
+		  smode = V16QImode;
+		  gen = gen_avx512bw_vec_dupv64qi;
+		}
 	      break;
 	    case E_V32QImode:
-	      gen = gen_avx2_pbroadcastv32qi_1;
+	      smode = V16QImode;
+	      gen = gen_avx2_pbroadcastv32qi;
 	      break;
 	    case E_V32HImode:
 	      if (TARGET_AVX512BW)
-		gen = gen_avx512bw_vec_dupv32hi_1;
+		{
+		  smode = V8HImode;
+		  gen = gen_avx512bw_vec_dupv32hi;
+		}
 	      break;
 	    case E_V16HImode:
-	      gen = gen_avx2_pbroadcastv16hi_1;
+	      smode = V8HImode;
+	      gen = gen_avx2_pbroadcastv16hi;
 	      break;
 	    case E_V16SImode:
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv16si_1;
+		{
+		  smode = V4SImode;
+		  gen = gen_avx512f_vec_dupv16si;
+		}
 	      break;
 	    case E_V8SImode:
-	      gen = gen_avx2_pbroadcastv8si_1;
+	      smode = V4SImode;
+	      gen = gen_avx2_pbroadcastv8si;
 	      break;
 	    case E_V16QImode:
 	      gen = gen_avx2_pbroadcastv16qi;
@@ -45993,19 +46006,25 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	      gen = gen_avx2_pbroadcastv8hi;
 	      break;
 	    case E_V16SFmode:
+	      smode = SFmode;
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv16sf_1;
+		gen = gen_avx512f_vec_dupv16sf;
 	      break;
 	    case E_V8SFmode:
-	      gen = gen_avx2_vec_dupv8sf_1;
+	      smode = SFmode;
+	      gen = gen_vec_dupv8sf;
 	      break;
 	    case E_V8DFmode:
+	      smode = DFmode;
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv8df_1;
+		gen = gen_avx512f_vec_dupv8df;
 	      break;
 	    case E_V8DImode:
 	      if (TARGET_AVX512F)
-		gen = gen_avx512f_vec_dupv8di_1;
+		{
+		  smode = V2DImode;
+		  gen = gen_avx512f_vec_dupv8di;
+		}
 	      break;
 	    /* For other modes prefer other shuffles this function creates.  */
 	    default: break;
@@ -46013,7 +46032,23 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	  if (gen != NULL)
 	    {
 	      if (!d->testing_p)
-		emit_insn (gen (d->target, d->op0));
+		{
+		  if (smode == VOIDmode)
+		    emit_insn (gen (d->target, d->op0));
+		  else
+		    {
+		      rtx op = d->op0;
+		      unsigned int oppos = 0;
+		      if (SUBREG_P (op))
+			{
+			  op = SUBREG_REG (op);
+			  oppos = SUBREG_BYTE (op);
+			}
+		      emit_insn (gen (d->target,
+				      gen_rtx_SUBREG (smode, op,
+						      oppos)));
+		    }
+		}
 	      return true;
 	    }
 	}
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7fb2b144f47..4a6fa077db5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4399,7 +4399,7 @@ 
       else
 	{
 	  rtx tmp = lowpart_subreg (V16SFmode, operands[3], V4SFmode);
-	  emit_insn (gen_avx512f_vec_dupv16sf_1 (tmp, tmp));
+	  emit_insn (gen_avx512f_vec_dupv16sf (tmp, tmp));
 	}
     }
   else
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ee73e1fdf80..90a700c154a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -7117,42 +7117,6 @@ 
    (set_attr "prefix" "orig,maybe_evex")
    (set_attr "mode" "SF")])
 
-(define_insn "avx2_vec_dup<mode>"
-  [(set (match_operand:VF1_128_256 0 "register_operand" "=v")
-	(vec_duplicate:VF1_128_256
-	  (vec_select:SF
-	    (match_operand:V4SF 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "vbroadcastss\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sselog1")
-    (set_attr "prefix" "maybe_evex")
-    (set_attr "mode" "<MODE>")])
-
-(define_insn "avx2_vec_dupv8sf_1"
-  [(set (match_operand:V8SF 0 "register_operand" "=v")
-	(vec_duplicate:V8SF
-	  (vec_select:SF
-	    (match_operand:V8SF 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "vbroadcastss\t{%x1, %0|%0, %x1}"
-  [(set_attr "type" "sselog1")
-    (set_attr "prefix" "maybe_evex")
-    (set_attr "mode" "V8SF")])
-
-(define_insn "avx512f_vec_dup<mode>_1"
-  [(set (match_operand:VF_512 0 "register_operand" "=v")
-	(vec_duplicate:VF_512
-	  (vec_select:<ssescalarmode>
-	    (match_operand:VF_512 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX512F"
-  "vbroadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}"
-  [(set_attr "type" "sselog1")
-    (set_attr "prefix" "evex")
-    (set_attr "mode" "<MODE>")])
-
 ;; Although insertps takes register source, we prefer
 ;; unpcklps with register source since it is shorter.
 (define_insn "*vec_concatv2sf_sse4_1"
@@ -17918,24 +17882,6 @@ 
    (set_attr "prefix" "vex,evex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "avx2_pbroadcast<mode>_1"
-  [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
-	(vec_duplicate:VI_256
-	  (vec_select:<ssescalarmode>
-	    (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "@
-   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
-   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
-  [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>")
-   (set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "<sseinsnmode>")])
-
 (define_insn "<avx2_avx512>_permvar<mode><mask_name>"
   [(set (match_operand:VI48F_256_512 0 "register_operand" "=v")
 	(unspec:VI48F_256_512
@@ -18111,32 +18057,6 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
-(define_insn "avx2_vec_dupv4df"
-  [(set (match_operand:V4DF 0 "register_operand" "=v")
-	(vec_duplicate:V4DF
-	  (vec_select:DF
-	    (match_operand:V2DF 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX2"
-  "vbroadcastsd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sselog1")
-   (set_attr "prefix" "maybe_evex")
-   (set_attr "mode" "V4DF")])
-
-(define_insn "<avx512>_vec_dup<mode>_1"
-  [(set (match_operand:VI_AVX512BW 0 "register_operand" "=v,v")
-	(vec_duplicate:VI_AVX512BW
-	  (vec_select:<ssescalarmode>
-	    (match_operand:VI_AVX512BW 1 "nonimmediate_operand" "v,m")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_AVX512F"
-  "@
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}
-   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %<iptr>1}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "<sseinsnmode>")])
-
 (define_insn "<avx512>_vec_dup<mode><mask_name>"
   [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
 	(vec_duplicate:V48_AVX512VL
@@ -18545,8 +18465,7 @@ 
 	     or VSHUFF128.  */
 	  gcc_assert (<MODE>mode == V8SFmode);
 	  if ((mask & 1) == 0)
-	    emit_insn (gen_avx2_vec_dupv8sf (op0,
-					     gen_lowpart (V4SFmode, op0)));
+	    emit_insn (gen_vec_dupv8sf (op0, gen_lowpart (V4SFmode, op0)));
 	  else
 	    emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
 						  GEN_INT (4), GEN_INT (5),
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c
index dfac3916b08..3ff7497aa21 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c
@@ -1,6 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx2 -O2" } */
-/* { dg-final { scan-assembler "vbroadcastss\[ \\t\]+\[^\n\]*%xmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "vbroadcastss\[ \\t\]+\[^\n\]*%ymm\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "vmovaps\[\t \]*\[^,\]*,%xmm\[0-9\]" } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c
index 7233398cd64..1c62364dac4 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c
@@ -151,8 +151,8 @@  f16 (V2 *x)
 }
 
 /* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%xmm16" 4 } } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 3 } } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%ymm16" 3 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%ymm16" 4 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$170\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
@@ -160,3 +160,4 @@  f16 (V2 *x)
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */
 /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */
 /* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$3\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */
+/* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */