[GCC-9,i386] : Emit inter-unit moves using preferred_for_speed infrastructure

Message ID CAFULd4YHC6BccF-1ja_5EVTxFZdoq4TpeC4AU8z5h4ODKhBa8w@mail.gmail.com
State New
Headers show
Series
  • [GCC-9,i386] : Emit inter-unit moves using preferred_for_speed infrastructure
Related show

Commit Message

Uros Bizjak April 18, 2018, 6:44 p.m.
Hello!

The attached patch conditionally emits inter-unit moves using the
preferred_for_speed infrastructure. The current approach, where registers
are enabled only for TARGET_INTER_UNIT_MOVES_{FROM,TO}_VEC, is not
optimal: in non-performance-critical parts of the binary we
can still emit inter-unit moves, shaving a couple of bytes here and
there.

2018-04-18  Uros Bizjak  <ubizjak@gmail.com>

    * config/i386/i386.md ("isa" attribute): Add x64_sse2.
    ("enabled" attribute): Handle x64_sse2 "isa" attribute.
    (*movdi_internal): Substitute Yi and Yj constraint with x
    and Ym and Yn constraint with y constraint.  Update "isa"
    attribute and set "preferred_for_speed" attribute from
    TARGET_INTER_UNIT_MOVES_{FROM,TO}_VEC for updated alternatives.
    (*movsi_internal): Ditto.
    (*movdf_internal): Ditto.
    (*movsf_internal): Ditto.
    (*zero_extendsidi2): Ditto.
    * config/i386/sse.md (vec_set<mode>_0): Ditto.
    (sse2_loadld): Ditto.
    (*vec_extract<ssevecmodelower>_0): Ditto.
    (*vec_extractv4si_0_zext_sse4): Ditto.
    (vec_concatv2di): Ditto.
    (*vec_dup<mode>): Ditto.
    * config/i386/mmx.md (*mov<mode>_internal): Ditto.
    * config/i386/constraints.md (Yi): Remove.
    (Yj): Remove.
    (Ym): Remove.
    (Yn): Remove.

Patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

I plan to commit the patch to gcc-9 when it opens for general development.

Uros.

Patch

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index f9564d3a385c..e750dc4cec74 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -99,14 +99,10 @@ 
 
 ;; We use the Y prefix to denote any number of conditional register sets:
 ;;  z	First SSE register.
-;;  i	SSE2 inter-unit moves to SSE register enabled
-;;  j	SSE2 inter-unit moves from SSE register enabled
 ;;  d	any EVEX encodable SSE register for AVX512BW target or any SSE register
 ;;	for SSE4_1 target, when inter-unit moves to SSE register are enabled
 ;;  e	any EVEX encodable SSE register for AVX512BW target or any SSE register
 ;;	for SSE4_1 target, when inter-unit moves from SSE register are enabled
-;;  m	MMX inter-unit moves to MMX register enabled
-;;  n	MMX inter-unit moves from MMX register enabled
 ;;  p	Integer register when TARGET_PARTIAL_REG_STALL is disabled
 ;;  a	Integer register when zero extensions with AND are disabled
 ;;  b	Any register that can be used as the GOT base when calling
@@ -123,14 +119,6 @@ 
 (define_register_constraint "Yz" "TARGET_SSE ? SSE_FIRST_REG : NO_REGS"
  "First SSE register (@code{%xmm0}).")
 
-(define_register_constraint "Yi"
- "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC ? ALL_SSE_REGS : NO_REGS"
- "@internal Any SSE register, when SSE2 and inter-unit moves to vector registers are enabled.")
-
-(define_register_constraint "Yj"
- "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC ? ALL_SSE_REGS : NO_REGS"
- "@internal Any SSE register, when SSE2 and inter-unit moves from vector registers are enabled.")
-
 (define_register_constraint "Yd"
  "TARGET_INTER_UNIT_MOVES_TO_VEC
   ? (TARGET_AVX512DQ
@@ -147,14 +135,6 @@ 
   : NO_REGS"
  "@internal Any EVEX encodable SSE register (@code{%xmm0-%xmm31}) for AVX512DQ target or any SSE register for SSE4_1 target, when inter-unit moves from vector registers are enabled.")
 
-(define_register_constraint "Ym"
- "TARGET_MMX && TARGET_INTER_UNIT_MOVES_TO_VEC ? MMX_REGS : NO_REGS"
- "@internal Any MMX register, when inter-unit moves to vector registers are enabled.")
-
-(define_register_constraint "Yn"
- "TARGET_MMX && TARGET_INTER_UNIT_MOVES_FROM_VEC ? MMX_REGS : NO_REGS"
- "@internal Any MMX register, when inter-unit moves from vector registers are enabled.")
-
 (define_register_constraint "Yp"
  "TARGET_PARTIAL_REG_STALL ? NO_REGS : GENERAL_REGS"
  "@internal Any integer register when TARGET_PARTIAL_REG_STALL is disabled.")
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 95ca2cf9e3d6..285ad3c9d61f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -810,7 +810,7 @@ 
 (define_attr "movu" "0,1" (const_string "0"))
 
 ;; Used to control the "enabled" attribute on a per-instruction basis.
-(define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
+(define_attr "isa" "base,x64,x64_sse2,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
 		    sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
 		    avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
 		    avx512bw,noavx512bw,avx512dq,noavx512dq,
@@ -819,6 +819,8 @@ 
 
 (define_attr "enabled" ""
   (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT")
+	 (eq_attr "isa" "x64_sse2")
+	   (symbol_ref "TARGET_64BIT && TARGET_SSE2")
 	 (eq_attr "isa" "x64_sse4")
 	   (symbol_ref "TARGET_64BIT && TARGET_SSE4_1")
 	 (eq_attr "isa" "x64_sse4_noavx")
@@ -2221,9 +2223,9 @@ 
 
 (define_insn "*movdi_internal"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,m,?r ,?*Yd,?r ,?*Yi,?*Ym,?*Yi,*k,*k ,*r,*m")
+    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,m,?r ,?*Yd,?r,?*v,?*y,?*x,*k,*k ,*r,*m")
 	(match_operand:DI 1 "general_operand"
-    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,v,*Ye,r   ,*Yj,r   ,*Yj ,*Yn ,*r,*km,*k,*k"))]
+    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,v,*Ye,r   ,*v,r  ,*x ,*y ,*r,*km,*k,*k"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2301,8 +2303,12 @@ 
   [(set (attr "isa")
      (cond [(eq_attr "alternative" "0,1,17,18")
 	      (const_string "nox64")
-	    (eq_attr "alternative" "2,3,4,5,10,11,19,20,23,25")
+	    (eq_attr "alternative" "2,3,4,5,10,11,23,25")
 	      (const_string "x64")
+	    (eq_attr "alternative" "19,20")
+	      (const_string "x64_sse2")
+	    (eq_attr "alternative" "21,22")
+	      (const_string "sse2")
 	   ]
 	   (const_string "*")))
    (set (attr "type")
@@ -2370,6 +2376,13 @@ 
 	      (const_string "V2SF")
 	   ]
 	   (const_string "DI")))
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "10,19")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	    (eq_attr "alternative" "11,20")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))
    (set (attr "enabled")
      (cond [(eq_attr "alternative" "15")
               (if_then_else
@@ -2450,9 +2463,9 @@ 
 
 (define_insn "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand"
-    "=r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,?r ,?*Yi,*k,*k ,*rm")
+    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm")
 	(match_operand:SI 1 "general_operand"
-    "g ,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,*Yj,r   ,*r,*km,*k"))]
+    "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2513,7 +2526,12 @@ 
       gcc_unreachable ();
     }
 }
-  [(set (attr "type")
+  [(set (attr "isa")
+     (cond [(eq_attr "alternative" "12,13")
+	      (const_string "sse2")
+	   ]
+	   (const_string "*")))
+   (set (attr "type")
      (cond [(eq_attr "alternative" "2")
 	      (const_string "mmx")
 	    (eq_attr "alternative" "3,4,5,6,7")
@@ -2558,7 +2576,14 @@ 
 	         (not (match_test "TARGET_SSE2")))
 	      (const_string "SF")
 	   ]
-	   (const_string "SI")))])
+	   (const_string "SI")))
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "6,12")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	    (eq_attr "alternative" "7,13")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_insn "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m")
@@ -3470,9 +3495,9 @@ 
 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,Yi,r  ,o ,r  ,m")
+    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  ,o ,r  ,m")
 	(match_operand:DF 1 "general_operand"
-    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,Yj,r ,roF,rF,rmF,rC"))]
+    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (lra_in_progress || reload_completed
        || !CONST_DOUBLE_P (operands[1])
@@ -3548,10 +3573,12 @@ 
   [(set (attr "isa")
 	(cond [(eq_attr "alternative" "3,4,5,6,7,22,23")
 		 (const_string "nox64")
-	       (eq_attr "alternative" "8,9,10,11,20,21,24,25")
+	       (eq_attr "alternative" "8,9,10,11,24,25")
 		 (const_string "x64")
 	       (eq_attr "alternative" "12,13,14,15")
 		 (const_string "sse2")
+	       (eq_attr "alternative" "20,21")
+		 (const_string "x64_sse2")
 	      ]
 	      (const_string "*")))
    (set (attr "type")
@@ -3652,7 +3679,12 @@ 
            (symbol_ref "true")))
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "3,4")
-              (symbol_ref "TARGET_INTEGER_DFMODE_MOVES")]
+              (symbol_ref "TARGET_INTEGER_DFMODE_MOVES")
+	    (eq_attr "alternative" "20")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	    (eq_attr "alternative" "21")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
            (symbol_ref "true")))
    (set (attr "enabled")
      (cond [(eq_attr "alternative" "22,23,24,25")
@@ -3674,9 +3706,9 @@ 
 
 (define_insn "*movsf_internal"
   [(set (match_operand:SF 0 "nonimmediate_operand"
-	  "=Yf*f,m   ,Yf*f,?r ,?m,v,v,v,m,?r,?Yi,!*y,!*y,!m,!r ,!*Ym,r  ,m")
+	  "=Yf*f,m   ,Yf*f,?r ,?m,v,v,v,m,?r,?v,!*y,!*y,!m,!r,!*y,r  ,m")
 	(match_operand:SF 1 "general_operand"
-	  "Yf*fm,Yf*f,G   ,rmF,rF,C,v,m,v,Yj,r  ,*y ,m  ,*y,*Yn,r   ,rmF,rF"))]
+	  "Yf*fm,Yf*f,G   ,rmF,rF,C,v,m,v,v ,r ,*y ,m  ,*y,*y,r  ,rmF,rF"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
    && (lra_in_progress || reload_completed
        || !CONST_DOUBLE_P (operands[1])
@@ -3738,7 +3770,12 @@ 
       gcc_unreachable ();
     }
 }
-  [(set (attr "type")
+  [(set (attr "isa")
+     (cond [(eq_attr "alternative" "9,10")
+	      (const_string "sse2")
+	   ]
+	   (const_string "*")))
+   (set (attr "type")
 	(cond [(eq_attr "alternative" "0,1,2")
 		 (const_string "fmov")
 	       (eq_attr "alternative" "3,4,16,17")
@@ -3800,6 +3837,13 @@ 
 		       (const_string "SF"))
 	      ]
 	      (const_string "SF")))
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "9,14")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	    (eq_attr "alternative" "10,15")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+           (symbol_ref "true")))
    (set (attr "enabled")
      (cond [(eq_attr "alternative" "16,17")
               (if_then_else
@@ -3889,10 +3933,10 @@ 
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-		"=r,?r,?o,r   ,o,?*Ym,?!*y,$r,$Yi,$x,*x,*v,*r")
+		"=r,?r,?o,r   ,o,?*y,?!*y,$r,$v,$x,*x,*v,*r")
 	(zero_extend:DI
 	 (match_operand:SI 1 "x86_64_zext_operand"
-	        "0 ,rm,r ,rmWz,0,r   ,m   ,Yj,r  ,m ,*x,*v,*k")))]
+	        "0 ,rm,r ,rmWz,0,r  ,m   ,v ,r ,m ,*x,*v,*k")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -3936,7 +3980,7 @@ 
 	      (const_string "nox64")
 	    (eq_attr "alternative" "3")
 	      (const_string "x64")
-	    (eq_attr "alternative" "9")
+	    (eq_attr "alternative" "7,8,9")
 	      (const_string "sse2")
 	    (eq_attr "alternative" "10")
 	      (const_string "sse4")
@@ -3982,7 +4026,14 @@ 
 	    (eq_attr "alternative" "8,10,11")
 	      (const_string "TI")
 	   ]
-	   (const_string "SI")))])
+	   (const_string "SI")))
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "7")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	    (eq_attr "alternative" "5,8")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+           (symbol_ref "true")))])
 
 (define_split
   [(set (match_operand:DI 0 "memory_operand")
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f8575c634358..a75807c4fd99 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -78,9 +78,9 @@ 
 
 (define_insn "*mov<mode>_internal"
   [(set (match_operand:MMXMODE 0 "nonimmediate_operand"
-    "=r ,o ,r,r ,m ,?!y,!y,?!y,m  ,r   ,?!Ym,v,v,v,m,r ,Yi,!Ym,*Yi")
+    "=r ,o ,r,r ,m ,?!y,!y,?!y,m  ,r  ,?!y,v,v,v,m,r,v,!y,*x")
 	(match_operand:MMXMODE 1 "vector_move_operand"
-    "rCo,rC,C,rm,rC,C  ,!y,m  ,?!y,?!Yn,r   ,C,v,m,v,Yj,r ,*Yj,!Yn"))]
+    "rCo,rC,C,rm,rC,C  ,!y,m  ,?!y,?!y,r  ,C,v,m,v,v,r,*x,!y"))]
   "TARGET_MMX
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
@@ -146,8 +146,12 @@ 
   [(set (attr "isa")
      (cond [(eq_attr "alternative" "0,1")
 	      (const_string "nox64")
-	    (eq_attr "alternative" "2,3,4,9,10,15,16")
+	    (eq_attr "alternative" "2,3,4,9,10")
 	      (const_string "x64")
+	    (eq_attr "alternative" "15,16")
+	      (const_string "x64_sse2")
+	    (eq_attr "alternative" "17,18")
+	      (const_string "sse2")
 	   ]
 	   (const_string "*")))
    (set (attr "type")
@@ -202,7 +206,14 @@ 
 		      (not (match_test "TARGET_SSE2"))))
 	      (const_string "V2SF")
 	   ]
-	   (const_string "DI")))])
+	   (const_string "DI")))
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "9,15")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	    (eq_attr "alternative" "10,16")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_split
   [(set (match_operand:MMXMODE 0 "nonimmediate_gr_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 640971d5e12b..858c29ec3e26 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -7008,7 +7008,7 @@ 
 ;; see comment above inline_secondary_memory_needed function in i386.c
 (define_insn "vec_set<mode>_0"
   [(set (match_operand:VI4F_128 0 "nonimmediate_operand"
-	  "=Yr,*x,v,v,Yi,x,x,v,Yr ,*x ,x  ,m ,m   ,m")
+	  "=Yr,*x,v,v,v,x,x,v,Yr ,*x ,x  ,m ,m   ,m")
 	(vec_merge:VI4F_128
 	  (vec_duplicate:VI4F_128
 	    (match_operand:<ssescalarmode> 2 "general_operand"
@@ -7071,7 +7071,12 @@ 
 	      (const_string "vex")
 	   ]
 	   (const_string "*")))
-   (set_attr "mode" "SF,SF,SF,<ssescalarmode>,SI,SF,SF,SF,TI,TI,TI,*,*,*")])
+   (set_attr "mode" "SF,SF,SF,<ssescalarmode>,SI,SF,SF,SF,TI,TI,TI,*,*,*")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "4")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 ;; A subset is vec_setv4sf.
 (define_insn "*vec_setv4sf_sse4_1"
@@ -7107,7 +7112,7 @@ 
 
 ;; All of vinsertps, vmovss, vmovd clear also the higher bits.
 (define_insn "vec_set<mode>_0"
-  [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,Yi")
+  [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,v")
 	(vec_merge:VI4F_256_512
 	  (vec_duplicate:VI4F_256_512
 	    (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r"))
@@ -7123,7 +7128,12 @@ 
 		   (const_string "sselog")
 		   (const_string "ssemov")))
    (set_attr "prefix" "maybe_evex")
-   (set_attr "mode" "SF,<ssescalarmode>,SI")])
+   (set_attr "mode" "SF,<ssescalarmode>,SI")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "2")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_insn "sse4_1_insertps"
   [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
@@ -13606,7 +13616,7 @@ 
   "operands[2] = CONST0_RTX (V4SImode);")
 
 (define_insn "sse2_loadld"
-  [(set (match_operand:V4SI 0 "register_operand"       "=v,Yi,x,x,v")
+  [(set (match_operand:V4SI 0 "register_operand"       "=v,v,x,x,v")
 	(vec_merge:V4SI
 	  (vec_duplicate:V4SI
 	    (match_operand:SI 2 "nonimmediate_operand" "m ,r ,m,x,v"))
@@ -13622,7 +13632,12 @@ 
   [(set_attr "isa" "sse2,sse2,noavx,noavx,avx")
    (set_attr "type" "ssemov")
    (set_attr "prefix" "maybe_vex,maybe_vex,orig,orig,maybe_evex")
-   (set_attr "mode" "TI,TI,V4SF,SF,SF")])
+   (set_attr "mode" "TI,TI,V4SF,SF,SF")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 ;; QI and HI modes handled by pextr patterns.
 (define_mode_iterator PEXTR_MODE12
@@ -13687,12 +13702,18 @@ 
   "#")
 
 (define_insn "*vec_extract<ssevecmodelower>_0"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand"	       "=r ,v ,m")
+  [(set (match_operand:SWI48 0 "nonimmediate_operand"	       "=r,r,v ,m")
 	(vec_select:SWI48
-	  (match_operand:<ssevecmode> 1 "nonimmediate_operand" "mYj,vm,v")
+	  (match_operand:<ssevecmode> 1 "nonimmediate_operand" "m ,v,vm,v")
 	  (parallel [(const_int 0)])))]
   "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "#")
+  "#"
+  [(set_attr "isa" "*,sse2,*,*")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_insn "*vec_extractv2di_0_sse"
   [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
@@ -13716,11 +13737,16 @@ 
   [(set (match_operand:DI 0 "register_operand" "=r,x,v")
 	(zero_extend:DI
 	  (vec_select:SI
-	    (match_operand:V4SI 1 "register_operand" "Yj,x,v")
+	    (match_operand:V4SI 1 "register_operand" "v,x,v")
 	    (parallel [(const_int 0)]))))]
   "TARGET_SSE4_1"
   "#"
-  [(set_attr "isa" "x64,*,avx512f")])
+  [(set_attr "isa" "x64,*,avx512f")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "0")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_insn "*vec_extractv4si_0_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
@@ -14064,10 +14090,10 @@ 
 ;; movd instead of movq is required to handle broken assemblers.
 (define_insn "vec_concatv2di"
   [(set (match_operand:V2DI 0 "register_operand"
-	  "=Yr,*x,x ,v ,Yi,v ,x    ,x,v ,x,x,v")
+	  "=Yr,*x,x ,v ,v,v ,x   ,x,v ,x,x,v")
 	(vec_concat:V2DI
 	  (match_operand:DI 1 "nonimmediate_operand"
-	  "  0, 0,x ,Yv,r ,vm,?!*Yn,0,Yv,0,0,v")
+	  "  0, 0,x ,Yv,r,vm,?!*y,0,Yv,0,0,v")
 	  (match_operand:DI 2 "vector_move_operand"
 	  " rm,rm,rm,rm,C ,C ,C ,x,Yv,x,m,m")))]
   "TARGET_SSE"
@@ -14092,7 +14118,7 @@ 
 	    (eq_attr "alternative" "3")
 	      (const_string "x64_avx512dq")
 	    (eq_attr "alternative" "4")
-	      (const_string "x64")
+	      (const_string "x64_sse2")
 	    (eq_attr "alternative" "5,6")
 	      (const_string "sse2")
 	    (eq_attr "alternative" "7")
@@ -14129,11 +14155,18 @@ 
 	      (const_string "maybe_evex")
 	   ]
 	   (const_string "orig")))
-   (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+   (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "4")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	    (eq_attr "alternative" "6")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 ;; vmovq clears also the higher bits.
 (define_insn "vec_set<mode>_0"
-  [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=Yi,v")
+  [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=v,v")
 	(vec_merge:VI8_AVX_AVX512F
 	  (vec_duplicate:VI8_AVX_AVX512F
 	    (match_operand:<ssescalarmode> 2 "general_operand" "r,vm"))
@@ -14145,7 +14178,12 @@ 
    (set_attr "type" "ssemov")
    (set_attr "prefix_rex" "1,*")
    (set_attr "prefix" "maybe_evex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "TI")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "0")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_expand "vec_unpacks_lo_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
@@ -17901,7 +17939,7 @@ 
   [V32QI V16QI V16HI V8HI V8SI V4SI])
 
 (define_insn "*vec_dup<mode>"
-  [(set (match_operand:AVX2_VEC_DUP_MODE 0 "register_operand" "=x,x,Yi")
+  [(set (match_operand:AVX2_VEC_DUP_MODE 0 "register_operand" "=x,x,v")
 	(vec_duplicate:AVX2_VEC_DUP_MODE
 	  (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "m,x,$r")))]
   "TARGET_AVX2"
@@ -17913,7 +17951,12 @@ 
    (set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_evex")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set_attr "mode" "<sseinsnmode>")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "2")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_insn "vec_dup<mode>"
   [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x")