[02/43] i386: Emulate MMX packsswb/packssdw/packuswb with SSE2

Message ID 20190210001947.27278-3-hjl.tools@gmail.com
State Superseded
Headers show
Series
  • V3: Emulate MMX intrinsics with SSE
Related show

Commit Message

H.J. Lu Feb. 10, 2019, 12:19 a.m.
Emulate MMX packsswb/packssdw/packuswb with SSE packsswb/packssdw/packuswb,
followed by moving bits 64:95 to bits 32:63 in the SSE register.  Only an SSE
register source operand is allowed.

2019-02-08  H.J. Lu  <hongjiu.lu@intel.com>
	    Uros Bizjak  <ubizjak@gmail.com>

	PR target/89021
	* config/i386/i386-protos.h (ix86_move_vector_high_sse_to_mmx):
	New prototype.
	(ix86_split_mmx_pack): Likewise.
	* config/i386/i386.c (ix86_move_vector_high_sse_to_mmx): New
	function.
	(ix86_split_mmx_pack): Likewise.
	* config/i386/i386.md (mmx_isa): New.
	(enabled): Also check mmx_isa.
	* config/i386/mmx.md (any_s_truncate): New code iterator.
	(s_trunsuffix): New code attr.
	(mmx_packsswb): Removed.
	(mmx_packssdw): Likewise.
	(mmx_packuswb): Likewise.
	(mmx_pack<s_trunsuffix>swb): New define_insn_and_split to emulate
	MMX packsswb/packuswb with SSE2.
	(mmx_packssdw): Likewise.
---
 gcc/config/i386/i386-protos.h |  3 ++
 gcc/config/i386/i386.c        | 54 ++++++++++++++++++++++++++++
 gcc/config/i386/i386.md       | 12 +++++++
 gcc/config/i386/mmx.md        | 67 +++++++++++++++++++----------------
 4 files changed, 106 insertions(+), 30 deletions(-)

-- 
2.20.1

Comments

Uros Bizjak Feb. 10, 2019, 9:56 a.m. | #1
On 2/10/19, H.J. Lu <hjl.tools@gmail.com> wrote:
> Emulate MMX packsswb/packssdw/packuswb with SSE packsswb/packssdw/packuswb

> plus moving bits 64:95 to bits 32:63 in SSE register.  Only SSE register

> source operand is allowed.

>

> 2019-02-08  H.J. Lu  <hongjiu.lu@intel.com>

> 	    Uros Bizjak  <ubizjak@gmail.com>

>

> 	PR target/89021

> 	* config/i386/i386-protos.h (ix86_move_vector_high_sse_to_mmx):

> 	New prototype.

> 	(ix86_split_mmx_pack): Likewise.

> 	* config/i386/i386.c (ix86_move_vector_high_sse_to_mmx): New

> 	function.

> 	(ix86_split_mmx_pack): Likewise.

> 	* config/i386/i386.md (mmx_isa): New.

> 	(enabled): Also check mmx_isa.

> 	* config/i386/mmx.md (any_s_truncate): New code iterator.

> 	(s_trunsuffix): New code attr.

> 	(mmx_packsswb): Removed.

> 	(mmx_packssdw): Likewise.

> 	(mmx_packuswb): Likewise.

> 	(mmx_pack<s_trunsuffix>swb): New define_insn_and_split to emulate

> 	MMX packsswb/packuswb with SSE2.

> 	(mmx_packssdw): Likewise.


LGTM, with a couple of nits below.

> ---

>  gcc/config/i386/i386-protos.h |  3 ++

>  gcc/config/i386/i386.c        | 54 ++++++++++++++++++++++++++++

>  gcc/config/i386/i386.md       | 12 +++++++

>  gcc/config/i386/mmx.md        | 67 +++++++++++++++++++----------------

>  4 files changed, 106 insertions(+), 30 deletions(-)

>

> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h

> index 2d600173917..bb96a420a85 100644

> --- a/gcc/config/i386/i386-protos.h

> +++ b/gcc/config/i386/i386-protos.h

> @@ -200,6 +200,9 @@ extern void ix86_expand_vecop_qihi (enum rtx_code, rtx,

> rtx, rtx);

>

>  extern rtx ix86_split_stack_guard (void);

>

> +extern void ix86_move_vector_high_sse_to_mmx (rtx);

> +extern void ix86_split_mmx_pack (rtx[], enum rtx_code);

> +

>  #ifdef TREE_CODE

>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree,

> int);

>  #endif	/* TREE_CODE  */

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index ba02c26c8b2..2af7f891350 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -19955,6 +19955,60 @@ ix86_expand_vector_move_misalign (machine_mode

> mode, rtx operands[])

>      gcc_unreachable ();

>  }

>

> +/* Move bits 64:95 to bits 32:63.  */

> +

> +void

> +ix86_move_vector_high_sse_to_mmx (rtx op)

> +{

> +  rtx mask = gen_rtx_PARALLEL (VOIDmode,

> +			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),

> +					  GEN_INT (0), GEN_INT (0)));

> +  rtx dest = gen_rtx_REG (V4SImode, REGNO (op));

> +  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

> +  rtx insn = gen_rtx_SET (dest, op);

> +  emit_insn (insn);

> +}

> +

> +/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

> +

> +void

> +ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

> +{

> +  rtx op0 = operands[0];

> +  rtx op1 = operands[1];

> +  rtx op2 = operands[2];

> +

> +  machine_mode dmode = GET_MODE (op0);

> +  machine_mode smode = GET_MODE (op1);

> +  machine_mode inner_dmode = GET_MODE_INNER (dmode);

> +  machine_mode inner_smode = GET_MODE_INNER (smode);

> +

> +  /* Get the corresponding SSE mode for destination.  */

> +  int nunits = 16 / GET_MODE_SIZE (inner_dmode);

> +  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),

> +					    nunits).require ();

> +  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),

> +						 nunits / 2).require ();

> +

> +  /* Get the corresponding SSE mode for source.  */

> +  nunits = 16 / GET_MODE_SIZE (inner_smode);

> +  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),

> +					    nunits).require ();

> +

> +  /* Generate SSE pack with signed/unsigned saturation.  */

> +  rtx dest = gen_rtx_REG (sse_dmode, REGNO (op0));

> +  op1 = gen_rtx_REG (sse_smode, REGNO (op1));

> +  op2 = gen_rtx_REG (sse_smode, REGNO (op2));

> +

> +  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);

> +  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);

> +  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,

> +						    op1, op2));

> +  emit_insn (insn);

> +

> +  ix86_move_vector_high_sse_to_mmx (op0);

> +}

> +

>  /* Helper function of ix86_fixup_binary_operands to canonicalize

>     operand order.  Returns true if the operands should be swapped.  */

>

> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

> index 4a32144a71a..72685107fc0 100644

> --- a/gcc/config/i386/i386.md

> +++ b/gcc/config/i386/i386.md

> @@ -792,6 +792,9 @@

>  		    avx512vl,noavx512vl,x64_avx512dq,x64_avx512bw"

>    (const_string "base"))

>

> +;; Define instruction set of MMX instructions

> +(define_attr "mmx_isa" "base,native,x64,x64_noavx,x64_avx" (const_string

> "base"))

> +

>  (define_attr "enabled" ""

>    (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT")

>  	 (eq_attr "isa" "x64_sse2")

> @@ -830,6 +833,15 @@

>  	 (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")

>  	 (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")

>  	 (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")

> +

> +	 (eq_attr "mmx_isa" "native")

> +	   (symbol_ref "!TARGET_MMX_WITH_SSE")

> +	 (eq_attr "mmx_isa" "x64")

> +	   (symbol_ref "TARGET_MMX_WITH_SSE")

> +	 (eq_attr "mmx_isa" "x64_avx")

> +	   (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")

> +	 (eq_attr "mmx_isa" "x64_noavx")

> +	   (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")

>  	]

>  	(const_int 1)))

>

> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md

> index c1e0f2c411e..10096f7cab7 100644

> --- a/gcc/config/i386/mmx.md

> +++ b/gcc/config/i386/mmx.md

> @@ -58,6 +58,11 @@

>  ;; Mapping from integer vector mode to mnemonic suffix

>  (define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI

> "q")])

>

> +;; Used in signed and unsigned truncations with saturation.

> +(define_code_iterator any_s_truncate [ss_truncate us_truncate])

> +;; Instruction suffix for truncations with saturation.

> +(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])


Please move definitions that have a single use near their usage site.

> +

>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

>  ;;

>  ;; Move patterns

> @@ -1046,41 +1051,43 @@

>  ;;

>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

>

> -(define_insn "mmx_packsswb"

> -  [(set (match_operand:V8QI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_pack<s_trunsuffix>swb"

> +  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")

>  	(vec_concat:V8QI

> -	  (ss_truncate:V4QI

> -	    (match_operand:V4HI 1 "register_operand" "0"))

> -	  (ss_truncate:V4QI

> -	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]

> -  "TARGET_MMX"

> -  "packsswb\t{%2, %0|%0, %2}"

> -  [(set_attr "type" "mmxshft")

> -   (set_attr "mode" "DI")])

> +	  (any_s_truncate:V4QI

> +	    (match_operand:V4HI 1 "register_operand" "0,0,Yv"))

> +	  (any_s_truncate:V4QI

> +	    (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))))]

> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"

> +  "@

> +   pack<s_trunsuffix>swb\t{%2, %0|%0, %2}

> +   #

> +   #"

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"


The above should be written without the leading &&, and with
"reload_completed" last. In effect, the condition of the separate split
pattern would read:

"TARGET_MMX_WITH_SSE && reload_completed".

> +  [(const_int 0)]

> +  "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>);"

> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")

> +   (set_attr "type" "mmxshft,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_packssdw"

> -  [(set (match_operand:V4HI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_packssdw"

> +  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")

>  	(vec_concat:V4HI

>  	  (ss_truncate:V2HI

> -	    (match_operand:V2SI 1 "register_operand" "0"))

> +	    (match_operand:V2SI 1 "register_operand" "0,0,Yv"))

>  	  (ss_truncate:V2HI

> -	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))]

> -  "TARGET_MMX"

> -  "packssdw\t{%2, %0|%0, %2}"

> -  [(set_attr "type" "mmxshft")

> -   (set_attr "mode" "DI")])

> -

> -(define_insn "mmx_packuswb"

> -  [(set (match_operand:V8QI 0 "register_operand" "=y")

> -	(vec_concat:V8QI

> -	  (us_truncate:V4QI

> -	    (match_operand:V4HI 1 "register_operand" "0"))

> -	  (us_truncate:V4QI

> -	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]

> -  "TARGET_MMX"

> -  "packuswb\t{%2, %0|%0, %2}"

> -  [(set_attr "type" "mmxshft")

> -   (set_attr "mode" "DI")])

> +	    (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))))]

> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"

> +  "@

> +   packssdw\t{%2, %0|%0, %2}

> +   #

> +   #"

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"


Also here.

> +  [(const_int 0)]

> +  "ix86_split_mmx_pack (operands, SS_TRUNCATE);"

> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")

> +   (set_attr "type" "mmxshft,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

>  (define_insn "mmx_punpckhbw"

>    [(set (match_operand:V8QI 0 "register_operand" "=y")

> --

> 2.20.1

>

>
Uros Bizjak Feb. 10, 2019, 10:04 a.m. | #2
On 2/10/19, Uros Bizjak <ubizjak@gmail.com> wrote:
> On 2/10/19, H.J. Lu <hjl.tools@gmail.com> wrote:

>> Emulate MMX packsswb/packssdw/packuswb with SSE

>> packsswb/packssdw/packuswb

>> plus moving bits 64:95 to bits 32:63 in SSE register.  Only SSE register

>> source operand is allowed.

>>

>> 2019-02-08  H.J. Lu  <hongjiu.lu@intel.com>

>> 	    Uros Bizjak  <ubizjak@gmail.com>

>>

>> 	PR target/89021

>> 	* config/i386/i386-protos.h (ix86_move_vector_high_sse_to_mmx):

>> 	New prototype.

>> 	(ix86_split_mmx_pack): Likewise.

>> 	* config/i386/i386.c (ix86_move_vector_high_sse_to_mmx): New

>> 	function.

>> 	(ix86_split_mmx_pack): Likewise.

>> 	* config/i386/i386.md (mmx_isa): New.

>> 	(enabled): Also check mmx_isa.

>> 	* config/i386/mmx.md (any_s_truncate): New code iterator.

>> 	(s_trunsuffix): New code attr.

>> 	(mmx_packsswb): Removed.

>> 	(mmx_packssdw): Likewise.

>> 	(mmx_packuswb): Likewise.

>> 	(mmx_pack<s_trunsuffix>swb): New define_insn_and_split to emulate

>> 	MMX packsswb/packuswb with SSE2.

>> 	(mmx_packssdw): Likewise.

>

> LGTM, with a couple of nits below.


Oh, you also need DONE; at the end of the preparation statements;
otherwise the splitters will inject (const_int 0) into the insn stream.

Uros.

>> ---

>>  gcc/config/i386/i386-protos.h |  3 ++

>>  gcc/config/i386/i386.c        | 54 ++++++++++++++++++++++++++++

>>  gcc/config/i386/i386.md       | 12 +++++++

>>  gcc/config/i386/mmx.md        | 67 +++++++++++++++++++----------------

>>  4 files changed, 106 insertions(+), 30 deletions(-)

>>

>> diff --git a/gcc/config/i386/i386-protos.h

>> b/gcc/config/i386/i386-protos.h

>> index 2d600173917..bb96a420a85 100644

>> --- a/gcc/config/i386/i386-protos.h

>> +++ b/gcc/config/i386/i386-protos.h

>> @@ -200,6 +200,9 @@ extern void ix86_expand_vecop_qihi (enum rtx_code,

>> rtx,

>> rtx, rtx);

>>

>>  extern rtx ix86_split_stack_guard (void);

>>

>> +extern void ix86_move_vector_high_sse_to_mmx (rtx);

>> +extern void ix86_split_mmx_pack (rtx[], enum rtx_code);

>> +

>>  #ifdef TREE_CODE

>>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree,

>> int);

>>  #endif	/* TREE_CODE  */

>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

>> index ba02c26c8b2..2af7f891350 100644

>> --- a/gcc/config/i386/i386.c

>> +++ b/gcc/config/i386/i386.c

>> @@ -19955,6 +19955,60 @@ ix86_expand_vector_move_misalign (machine_mode

>> mode, rtx operands[])

>>      gcc_unreachable ();

>>  }

>>

>> +/* Move bits 64:95 to bits 32:63.  */

>> +

>> +void

>> +ix86_move_vector_high_sse_to_mmx (rtx op)

>> +{

>> +  rtx mask = gen_rtx_PARALLEL (VOIDmode,

>> +			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),

>> +					  GEN_INT (0), GEN_INT (0)));

>> +  rtx dest = gen_rtx_REG (V4SImode, REGNO (op));

>> +  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

>> +  rtx insn = gen_rtx_SET (dest, op);

>> +  emit_insn (insn);

>> +}

>> +

>> +/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

>> +

>> +void

>> +ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

>> +{

>> +  rtx op0 = operands[0];

>> +  rtx op1 = operands[1];

>> +  rtx op2 = operands[2];

>> +

>> +  machine_mode dmode = GET_MODE (op0);

>> +  machine_mode smode = GET_MODE (op1);

>> +  machine_mode inner_dmode = GET_MODE_INNER (dmode);

>> +  machine_mode inner_smode = GET_MODE_INNER (smode);

>> +

>> +  /* Get the corresponding SSE mode for destination.  */

>> +  int nunits = 16 / GET_MODE_SIZE (inner_dmode);

>> +  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),

>> +					    nunits).require ();

>> +  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),

>> +						 nunits / 2).require ();

>> +

>> +  /* Get the corresponding SSE mode for source.  */

>> +  nunits = 16 / GET_MODE_SIZE (inner_smode);

>> +  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),

>> +					    nunits).require ();

>> +

>> +  /* Generate SSE pack with signed/unsigned saturation.  */

>> +  rtx dest = gen_rtx_REG (sse_dmode, REGNO (op0));

>> +  op1 = gen_rtx_REG (sse_smode, REGNO (op1));

>> +  op2 = gen_rtx_REG (sse_smode, REGNO (op2));

>> +

>> +  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);

>> +  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);

>> +  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,

>> +						    op1, op2));

>> +  emit_insn (insn);

>> +

>> +  ix86_move_vector_high_sse_to_mmx (op0);

>> +}

>> +

>>  /* Helper function of ix86_fixup_binary_operands to canonicalize

>>     operand order.  Returns true if the operands should be swapped.  */

>>

>> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

>> index 4a32144a71a..72685107fc0 100644

>> --- a/gcc/config/i386/i386.md

>> +++ b/gcc/config/i386/i386.md

>> @@ -792,6 +792,9 @@

>>  		    avx512vl,noavx512vl,x64_avx512dq,x64_avx512bw"

>>    (const_string "base"))

>>

>> +;; Define instruction set of MMX instructions

>> +(define_attr "mmx_isa" "base,native,x64,x64_noavx,x64_avx" (const_string

>> "base"))

>> +

>>  (define_attr "enabled" ""

>>    (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT")

>>  	 (eq_attr "isa" "x64_sse2")

>> @@ -830,6 +833,15 @@

>>  	 (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")

>>  	 (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")

>>  	 (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")

>> +

>> +	 (eq_attr "mmx_isa" "native")

>> +	   (symbol_ref "!TARGET_MMX_WITH_SSE")

>> +	 (eq_attr "mmx_isa" "x64")

>> +	   (symbol_ref "TARGET_MMX_WITH_SSE")

>> +	 (eq_attr "mmx_isa" "x64_avx")

>> +	   (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")

>> +	 (eq_attr "mmx_isa" "x64_noavx")

>> +	   (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")

>>  	]

>>  	(const_int 1)))

>>

>> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md

>> index c1e0f2c411e..10096f7cab7 100644

>> --- a/gcc/config/i386/mmx.md

>> +++ b/gcc/config/i386/mmx.md

>> @@ -58,6 +58,11 @@

>>  ;; Mapping from integer vector mode to mnemonic suffix

>>  (define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI

>> "q")])

>>

>> +;; Used in signed and unsigned truncations with saturation.

>> +(define_code_iterator any_s_truncate [ss_truncate us_truncate])

>> +;; Instruction suffix for truncations with saturation.

>> +(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])

>

> Please move definitions that have single use nearby their usage site.

>

>> +

>>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

>>  ;;

>>  ;; Move patterns

>> @@ -1046,41 +1051,43 @@

>>  ;;

>>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

>>

>> -(define_insn "mmx_packsswb"

>> -  [(set (match_operand:V8QI 0 "register_operand" "=y")

>> +(define_insn_and_split "mmx_pack<s_trunsuffix>swb"

>> +  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")

>>  	(vec_concat:V8QI

>> -	  (ss_truncate:V4QI

>> -	    (match_operand:V4HI 1 "register_operand" "0"))

>> -	  (ss_truncate:V4QI

>> -	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]

>> -  "TARGET_MMX"

>> -  "packsswb\t{%2, %0|%0, %2}"

>> -  [(set_attr "type" "mmxshft")

>> -   (set_attr "mode" "DI")])

>> +	  (any_s_truncate:V4QI

>> +	    (match_operand:V4HI 1 "register_operand" "0,0,Yv"))

>> +	  (any_s_truncate:V4QI

>> +	    (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))))]

>> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"

>> +  "@

>> +   pack<s_trunsuffix>swb\t{%2, %0|%0, %2}

>> +   #

>> +   #"

>> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

>

> The above should be without first &&, with "reload_completed" last. In

> effect, the condition of the separate split pattern would read as:

>

> "TARGET_MMX_WITH_SSE && reload_completed".

>

>> +  [(const_int 0)]

>> +  "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>);"


Missing DONE; above.

>> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")

>> +   (set_attr "type" "mmxshft,sselog,sselog")

>> +   (set_attr "mode" "DI,TI,TI")])

>>

>> -(define_insn "mmx_packssdw"

>> -  [(set (match_operand:V4HI 0 "register_operand" "=y")

>> +(define_insn_and_split "mmx_packssdw"

>> +  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")

>>  	(vec_concat:V4HI

>>  	  (ss_truncate:V2HI

>> -	    (match_operand:V2SI 1 "register_operand" "0"))

>> +	    (match_operand:V2SI 1 "register_operand" "0,0,Yv"))

>>  	  (ss_truncate:V2HI

>> -	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))]

>> -  "TARGET_MMX"

>> -  "packssdw\t{%2, %0|%0, %2}"

>> -  [(set_attr "type" "mmxshft")

>> -   (set_attr "mode" "DI")])

>> -

>> -(define_insn "mmx_packuswb"

>> -  [(set (match_operand:V8QI 0 "register_operand" "=y")

>> -	(vec_concat:V8QI

>> -	  (us_truncate:V4QI

>> -	    (match_operand:V4HI 1 "register_operand" "0"))

>> -	  (us_truncate:V4QI

>> -	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]

>> -  "TARGET_MMX"

>> -  "packuswb\t{%2, %0|%0, %2}"

>> -  [(set_attr "type" "mmxshft")

>> -   (set_attr "mode" "DI")])

>> +	    (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))))]

>> +  "TARGET_MMX || TARGET_MMX_WITH_SSE"

>> +  "@

>> +   packssdw\t{%2, %0|%0, %2}

>> +   #

>> +   #"

>> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

>

> Also here.

>

>> +  [(const_int 0)]

>> +  "ix86_split_mmx_pack (operands, SS_TRUNCATE);"


And here.

>> +  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")

>> +   (set_attr "type" "mmxshft,sselog,sselog")

>> +   (set_attr "mode" "DI,TI,TI")])

>>

>>  (define_insn "mmx_punpckhbw"

>>    [(set (match_operand:V8QI 0 "register_operand" "=y")

>> --

>> 2.20.1

>>

>>

>

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 2d600173917..bb96a420a85 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -200,6 +200,9 @@  extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
 
 extern rtx ix86_split_stack_guard (void);
 
+extern void ix86_move_vector_high_sse_to_mmx (rtx);
+extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
+
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
 #endif	/* TREE_CODE  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba02c26c8b2..2af7f891350 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19955,6 +19955,60 @@  ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
     gcc_unreachable ();
 }
 
+/* Move bits 64:95 to bits 32:63.  */
+
+void
+ix86_move_vector_high_sse_to_mmx (rtx op)
+{
+  rtx mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
+					  GEN_INT (0), GEN_INT (0)));
+  rtx dest = gen_rtx_REG (V4SImode, REGNO (op));
+  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+  rtx insn = gen_rtx_SET (dest, op);
+  emit_insn (insn);
+}
+
+/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
+
+void
+ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+
+  machine_mode dmode = GET_MODE (op0);
+  machine_mode smode = GET_MODE (op1);
+  machine_mode inner_dmode = GET_MODE_INNER (dmode);
+  machine_mode inner_smode = GET_MODE_INNER (smode);
+
+  /* Get the corresponding SSE mode for destination.  */
+  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
+  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
+					    nunits).require ();
+  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
+						 nunits / 2).require ();
+
+  /* Get the corresponding SSE mode for source.  */
+  nunits = 16 / GET_MODE_SIZE (inner_smode);
+  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
+					    nunits).require ();
+
+  /* Generate SSE pack with signed/unsigned saturation.  */
+  rtx dest = gen_rtx_REG (sse_dmode, REGNO (op0));
+  op1 = gen_rtx_REG (sse_smode, REGNO (op1));
+  op2 = gen_rtx_REG (sse_smode, REGNO (op2));
+
+  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
+						    op1, op2));
+  emit_insn (insn);
+
+  ix86_move_vector_high_sse_to_mmx (op0);
+}
+
 /* Helper function of ix86_fixup_binary_operands to canonicalize
    operand order.  Returns true if the operands should be swapped.  */
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4a32144a71a..72685107fc0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -792,6 +792,9 @@ 
 		    avx512vl,noavx512vl,x64_avx512dq,x64_avx512bw"
   (const_string "base"))
 
+;; Define instruction set of MMX instructions
+(define_attr "mmx_isa" "base,native,x64,x64_noavx,x64_avx" (const_string "base"))
+
 (define_attr "enabled" ""
   (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT")
 	 (eq_attr "isa" "x64_sse2")
@@ -830,6 +833,15 @@ 
 	 (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")
 	 (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")
 	 (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")
+
+	 (eq_attr "mmx_isa" "native")
+	   (symbol_ref "!TARGET_MMX_WITH_SSE")
+	 (eq_attr "mmx_isa" "x64")
+	   (symbol_ref "TARGET_MMX_WITH_SSE")
+	 (eq_attr "mmx_isa" "x64_avx")
+	   (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+	 (eq_attr "mmx_isa" "x64_noavx")
+	   (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
 	]
 	(const_int 1)))
 
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index c1e0f2c411e..10096f7cab7 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -58,6 +58,11 @@ 
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")])
 
+;; Used in signed and unsigned truncations with saturation.
+(define_code_iterator any_s_truncate [ss_truncate us_truncate])
+;; Instruction suffix for truncations with saturation.
+(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Move patterns
@@ -1046,41 +1051,43 @@ 
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(define_insn "mmx_packsswb"
-  [(set (match_operand:V8QI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_pack<s_trunsuffix>swb"
+  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")
 	(vec_concat:V8QI
-	  (ss_truncate:V4QI
-	    (match_operand:V4HI 1 "register_operand" "0"))
-	  (ss_truncate:V4QI
-	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]
-  "TARGET_MMX"
-  "packsswb\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxshft")
-   (set_attr "mode" "DI")])
+	  (any_s_truncate:V4QI
+	    (match_operand:V4HI 1 "register_operand" "0,0,Yv"))
+	  (any_s_truncate:V4QI
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))))]
+  "TARGET_MMX || TARGET_MMX_WITH_SSE"
+  "@
+   pack<s_trunsuffix>swb\t{%2, %0|%0, %2}
+   #
+   #"
+  "&& reload_completed && TARGET_MMX_WITH_SSE"
+  [(const_int 0)]
+  "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>);"
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "mmxshft,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_packssdw"
-  [(set (match_operand:V4HI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_packssdw"
+  [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv")
 	(vec_concat:V4HI
 	  (ss_truncate:V2HI
-	    (match_operand:V2SI 1 "register_operand" "0"))
+	    (match_operand:V2SI 1 "register_operand" "0,0,Yv"))
 	  (ss_truncate:V2HI
-	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))]
-  "TARGET_MMX"
-  "packssdw\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxshft")
-   (set_attr "mode" "DI")])
-
-(define_insn "mmx_packuswb"
-  [(set (match_operand:V8QI 0 "register_operand" "=y")
-	(vec_concat:V8QI
-	  (us_truncate:V4QI
-	    (match_operand:V4HI 1 "register_operand" "0"))
-	  (us_truncate:V4QI
-	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]
-  "TARGET_MMX"
-  "packuswb\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxshft")
-   (set_attr "mode" "DI")])
+	    (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))))]
+  "TARGET_MMX || TARGET_MMX_WITH_SSE"
+  "@
+   packssdw\t{%2, %0|%0, %2}
+   #
+   #"
+  "&& reload_completed && TARGET_MMX_WITH_SSE"
+  [(const_int 0)]
+  "ix86_split_mmx_pack (operands, SS_TRUNCATE);"
+  [(set_attr "mmx_isa" "native,x64_noavx,x64_avx")
+   (set_attr "type" "mmxshft,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_insn "mmx_punpckhbw"
   [(set (match_operand:V8QI 0 "register_operand" "=y")