[06/46] i386: Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX

Message ID 20190201211809.963-7-hjl.tools@gmail.com
State Superseded
Headers show
Series
  • Implement MMX intrinsics with SSE
Related show

Commit Message

H.J. Lu Feb. 1, 2019, 9:17 p.m.
Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX.  For MMX punpckhXX,
move bits 64:127 to bits 0:63 in the SSE register.  Only an SSE register
source operand is allowed.

	PR target/89021
	* config/i386/i386-protos.h (ix86_split_mmx_punpck): New
	prototype.
	* config/i386/i386.c (ix86_split_mmx_punpck): New function.
	* config/i386/mmx.md (mmx_punpckhbw): Changed to
	define_insn_and_split to support SSE emulation.
	(mmx_punpcklbw): Likewise.
	(mmx_punpckhwd): Likewise.
	(mmx_punpcklwd): Likewise.
	(mmx_punpckhdq): Likewise.
	(mmx_punpckldq): Likewise.
---
 gcc/config/i386/i386-protos.h |   1 +
 gcc/config/i386/i386.c        |  77 ++++++++++++++++++++++++
 gcc/config/i386/mmx.md        | 108 +++++++++++++++++++++-------------
 3 files changed, 144 insertions(+), 42 deletions(-)

-- 
2.20.1

Comments

Uros Bizjak Feb. 4, 2019, 12:01 p.m. | #1
On Fri, Feb 1, 2019 at 10:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX.  For MMX punpckhXX,

> move bits 64:127 to bits 0:63 in SSE register.  Only SSE register source

> operand is allowed.

>

>         PR target/89021

>         * config/i386/i386-protos.h (ix86_split_mmx_punpck): New

>         prototype.

>         * config/i386/i386.c (ix86_split_mmx_punpck): New function.

>         * config/i386/mmx.m (mmx_punpckhbw): Changed to

>         define_insn_and_split to support SSE emulation.

>         (mmx_punpcklbw): Likewise.

>         (mmx_punpckhwd): Likewise.

>         (mmx_punpcklwd): Likewise.

>         (mmx_punpckhdq): Likewise.

>         (mmx_punpckldq): Likewise.

> ---

>  gcc/config/i386/i386-protos.h |   1 +

>  gcc/config/i386/i386.c        |  77 ++++++++++++++++++++++++

>  gcc/config/i386/mmx.md        | 108 +++++++++++++++++++++-------------

>  3 files changed, 144 insertions(+), 42 deletions(-)

>

> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h

> index bb96a420a85..dc7fc38d8e4 100644

> --- a/gcc/config/i386/i386-protos.h

> +++ b/gcc/config/i386/i386-protos.h

> @@ -202,6 +202,7 @@ extern rtx ix86_split_stack_guard (void);

>

>  extern void ix86_move_vector_high_sse_to_mmx (rtx);

>  extern void ix86_split_mmx_pack (rtx[], enum rtx_code);

> +extern void ix86_split_mmx_punpck (rtx[], bool);

>

>  #ifdef TREE_CODE

>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index fde32983fa2..d795af1dd93 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -20006,6 +20006,83 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

>    ix86_move_vector_high_sse_to_mmx (op0);

>  }

>

> +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

> +

> +void

> +ix86_split_mmx_punpck (rtx operands[], bool high_p)

> +{

> +  rtx op0 = operands[0];

> +  rtx op1 = operands[1];

> +  rtx op2 = operands[2];

> +  machine_mode mode = GET_MODE (op0);

> +  rtx mask;

> +  /* The corresponding SSE mode.  */

> +  machine_mode sse_mode, double_sse_mode;

> +

> +  switch (mode)

> +    {

> +    case E_V8QImode:

> +      sse_mode = V16QImode;

> +      double_sse_mode = V32QImode;

> +      mask = gen_rtx_PARALLEL (VOIDmode,

> +                              gen_rtvec (16,

> +                                         GEN_INT (0), GEN_INT (16),

> +                                         GEN_INT (1), GEN_INT (17),

> +                                         GEN_INT (2), GEN_INT (18),

> +                                         GEN_INT (3), GEN_INT (19),

> +                                         GEN_INT (4), GEN_INT (20),

> +                                         GEN_INT (5), GEN_INT (21),

> +                                         GEN_INT (6), GEN_INT (22),

> +                                         GEN_INT (7), GEN_INT (23)));

> +      break;

> +

> +    case E_V4HImode:

> +      sse_mode = V8HImode;

> +      double_sse_mode = V16HImode;

> +      mask = gen_rtx_PARALLEL (VOIDmode,

> +                              gen_rtvec (8,

> +                                         GEN_INT (0), GEN_INT (8),

> +                                         GEN_INT (1), GEN_INT (9),

> +                                         GEN_INT (2), GEN_INT (10),

> +                                         GEN_INT (3), GEN_INT (11)));

> +      break;

> +

> +    case E_V2SImode:

> +      sse_mode = V4SImode;

> +      double_sse_mode = V8SImode;

> +      mask = gen_rtx_PARALLEL (VOIDmode,

> +                              gen_rtvec (4,

> +                                         GEN_INT (0), GEN_INT (4),

> +                                         GEN_INT (1), GEN_INT (5)));

> +      break;

> +

> +    default:

> +      gcc_unreachable ();

> +    }

> +

> +  /* Generate SSE punpcklXX.  */

> +  rtx dest = gen_rtx_REG (sse_mode, REGNO (op0));

> +  op1 = gen_rtx_REG (sse_mode, REGNO (op1));

> +  op2 = gen_rtx_REG (sse_mode, REGNO (op2));

> +

> +  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);

> +  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);

> +  rtx insn = gen_rtx_SET (dest, op2);

> +  emit_insn (insn);

> +

> +  if (high_p)

> +    {

> +      /* Move bits 64:127 to bits 0:63.  */

> +      mask = gen_rtx_PARALLEL (VOIDmode,

> +                              gen_rtvec (4, GEN_INT (2), GEN_INT (3),

> +                                         GEN_INT (0), GEN_INT (0)));

> +      dest = gen_rtx_REG (V4SImode, REGNO (dest));

> +      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

> +      insn = gen_rtx_SET (dest, op1);

> +      emit_insn (insn);

> +    }

> +}

> +

>  /* Helper function of ix86_fixup_binary_operands to canonicalize

>     operand order.  Returns true if the operands should be swapped.  */

>

> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md

> index c183f949a7c..fbd341109d6 100644

> --- a/gcc/config/i386/mmx.md

> +++ b/gcc/config/i386/mmx.md

> @@ -1083,87 +1083,111 @@

>     (set_attr "type" "mmxshft,sselog,sselog")

>     (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_punpckhbw"

> -  [(set (match_operand:V8QI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_punpckhbw"

> +  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V8QI

>           (vec_concat:V16QI

> -           (match_operand:V8QI 1 "register_operand" "0")

> -           (match_operand:V8QI 2 "nonimmediate_operand" "ym"))

> +           (match_operand:V8QI 1 "register_operand" "0,0,Yy")

> +           (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>            (parallel [(const_int 4) (const_int 12)

>                       (const_int 5) (const_int 13)

>                       (const_int 6) (const_int 14)

>                       (const_int 7) (const_int 15)])))]

> -  "TARGET_MMX"

> +  "TARGET_MMX_INSNS"

>    "punpckhbw\t{%2, %0|%0, %2}"


Please add "#" for alternatives that have to be split.

Uros.

> -  [(set_attr "type" "mmxcvt")

> -   (set_attr "mode" "DI")])

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

> +  [(const_int 0)]

> +  "ix86_split_mmx_punpck (operands, true);"

> +  [(set_attr "isa" "*,noavx,avx")

> +   (set_attr "type" "mmxcvt,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_punpcklbw"

> -  [(set (match_operand:V8QI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_punpcklbw"

> +  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V8QI

>           (vec_concat:V16QI

> -           (match_operand:V8QI 1 "register_operand" "0")

> -           (match_operand:V8QI 2 "nonimmediate_operand" "ym"))

> +           (match_operand:V8QI 1 "register_operand" "0,0,Yy")

> +           (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>            (parallel [(const_int 0) (const_int 8)

>                       (const_int 1) (const_int 9)

>                       (const_int 2) (const_int 10)

>                       (const_int 3) (const_int 11)])))]

> -  "TARGET_MMX"

> +  "TARGET_MMX_INSNS"

>    "punpcklbw\t{%2, %0|%0, %k2}"

> -  [(set_attr "type" "mmxcvt")

> -   (set_attr "mode" "DI")])

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

> +  [(const_int 0)]

> +  "ix86_split_mmx_punpck (operands, false);"

> +  [(set_attr "isa" "*,noavx,avx")

> +   (set_attr "type" "mmxcvt,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_punpckhwd"

> -  [(set (match_operand:V4HI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_punpckhwd"

> +  [(set (match_operand:V4HI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V4HI

>           (vec_concat:V8HI

> -           (match_operand:V4HI 1 "register_operand" "0")

> -           (match_operand:V4HI 2 "nonimmediate_operand" "ym"))

> +           (match_operand:V4HI 1 "register_operand" "0,0,Yy")

> +           (match_operand:V4HI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>            (parallel [(const_int 2) (const_int 6)

>                       (const_int 3) (const_int 7)])))]

> -  "TARGET_MMX"

> +  "TARGET_MMX_INSNS"

>    "punpckhwd\t{%2, %0|%0, %2}"

> -  [(set_attr "type" "mmxcvt")

> -   (set_attr "mode" "DI")])

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

> +  [(const_int 0)]

> +  "ix86_split_mmx_punpck (operands, true);"

> +  [(set_attr "isa" "*,noavx,avx")

> +   (set_attr "type" "mmxcvt,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_punpcklwd"

> -  [(set (match_operand:V4HI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_punpcklwd"

> +  [(set (match_operand:V4HI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V4HI

>           (vec_concat:V8HI

> -           (match_operand:V4HI 1 "register_operand" "0")

> -           (match_operand:V4HI 2 "nonimmediate_operand" "ym"))

> +           (match_operand:V4HI 1 "register_operand" "0,0,Yy")

> +           (match_operand:V4HI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>            (parallel [(const_int 0) (const_int 4)

>                       (const_int 1) (const_int 5)])))]

> -  "TARGET_MMX"

> +  "TARGET_MMX_INSNS"

>    "punpcklwd\t{%2, %0|%0, %k2}"

> -  [(set_attr "type" "mmxcvt")

> -   (set_attr "mode" "DI")])

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

> +  [(const_int 0)]

> +  "ix86_split_mmx_punpck (operands, false);"

> +  [(set_attr "isa" "*,noavx,avx")

> +   (set_attr "type" "mmxcvt,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_punpckhdq"

> -  [(set (match_operand:V2SI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_punpckhdq"

> +  [(set (match_operand:V2SI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V2SI

>           (vec_concat:V4SI

> -           (match_operand:V2SI 1 "register_operand" "0")

> -           (match_operand:V2SI 2 "nonimmediate_operand" "ym"))

> +           (match_operand:V2SI 1 "register_operand" "0,0,Yy")

> +           (match_operand:V2SI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>           (parallel [(const_int 1)

>                      (const_int 3)])))]

> -  "TARGET_MMX"

> +  "TARGET_MMX_INSNS"

>    "punpckhdq\t{%2, %0|%0, %2}"

> -  [(set_attr "type" "mmxcvt")

> -   (set_attr "mode" "DI")])

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

> +  [(const_int 0)]

> +  "ix86_split_mmx_punpck (operands, true);"

> +  [(set_attr "isa" "*,noavx,avx")

> +   (set_attr "type" "mmxcvt,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

> -(define_insn "mmx_punpckldq"

> -  [(set (match_operand:V2SI 0 "register_operand" "=y")

> +(define_insn_and_split "mmx_punpckldq"

> +  [(set (match_operand:V2SI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V2SI

>           (vec_concat:V4SI

> -           (match_operand:V2SI 1 "register_operand" "0")

> -           (match_operand:V2SI 2 "nonimmediate_operand" "ym"))

> +           (match_operand:V2SI 1 "register_operand" "0,0,Yy")

> +           (match_operand:V2SI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>           (parallel [(const_int 0)

>                      (const_int 2)])))]

> -  "TARGET_MMX"

> +  "TARGET_MMX_INSNS"

>    "punpckldq\t{%2, %0|%0, %k2}"

> -  [(set_attr "type" "mmxcvt")

> -   (set_attr "mode" "DI")])

> +  "&& reload_completed && TARGET_MMX_WITH_SSE"

> +  [(const_int 0)]

> +  "ix86_split_mmx_punpck (operands, false);"

> +  [(set_attr "isa" "*,noavx,avx")

> +   (set_attr "type" "mmxcvt,sselog,sselog")

> +   (set_attr "mode" "DI,TI,TI")])

>

>  (define_expand "mmx_pinsrw"

>    [(set (match_operand:V4HI 0 "register_operand")

> --

> 2.20.1

>
H.J. Lu Feb. 4, 2019, 1:56 p.m. | #2
On Mon, Feb 4, 2019 at 4:01 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Fri, Feb 1, 2019 at 10:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX.  For MMX punpckhXX,

> > move bits 64:127 to bits 0:63 in SSE register.  Only SSE register source

> > operand is allowed.

> >

> >         PR target/89021

> >         * config/i386/i386-protos.h (ix86_split_mmx_punpck): New

> >         prototype.

> >         * config/i386/i386.c (ix86_split_mmx_punpck): New function.

> >         * config/i386/mmx.m (mmx_punpckhbw): Changed to

> >         define_insn_and_split to support SSE emulation.

> >         (mmx_punpcklbw): Likewise.

> >         (mmx_punpckhwd): Likewise.

> >         (mmx_punpcklwd): Likewise.

> >         (mmx_punpckhdq): Likewise.

> >         (mmx_punpckldq): Likewise.

> > ---

> >  gcc/config/i386/i386-protos.h |   1 +

> >  gcc/config/i386/i386.c        |  77 ++++++++++++++++++++++++

> >  gcc/config/i386/mmx.md        | 108 +++++++++++++++++++++-------------

> >  3 files changed, 144 insertions(+), 42 deletions(-)

> >

> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h

> > index bb96a420a85..dc7fc38d8e4 100644

> > --- a/gcc/config/i386/i386-protos.h

> > +++ b/gcc/config/i386/i386-protos.h

> > @@ -202,6 +202,7 @@ extern rtx ix86_split_stack_guard (void);

> >

> >  extern void ix86_move_vector_high_sse_to_mmx (rtx);

> >  extern void ix86_split_mmx_pack (rtx[], enum rtx_code);

> > +extern void ix86_split_mmx_punpck (rtx[], bool);

> >

> >  #ifdef TREE_CODE

> >  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);

> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> > index fde32983fa2..d795af1dd93 100644

> > --- a/gcc/config/i386/i386.c

> > +++ b/gcc/config/i386/i386.c

> > @@ -20006,6 +20006,83 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

> >    ix86_move_vector_high_sse_to_mmx (op0);

> >  }

> >

> > +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

> > +

> > +void

> > +ix86_split_mmx_punpck (rtx operands[], bool high_p)

> > +{

> > +  rtx op0 = operands[0];

> > +  rtx op1 = operands[1];

> > +  rtx op2 = operands[2];

> > +  machine_mode mode = GET_MODE (op0);

> > +  rtx mask;

> > +  /* The corresponding SSE mode.  */

> > +  machine_mode sse_mode, double_sse_mode;

> > +

> > +  switch (mode)

> > +    {

> > +    case E_V8QImode:

> > +      sse_mode = V16QImode;

> > +      double_sse_mode = V32QImode;

> > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > +                              gen_rtvec (16,

> > +                                         GEN_INT (0), GEN_INT (16),

> > +                                         GEN_INT (1), GEN_INT (17),

> > +                                         GEN_INT (2), GEN_INT (18),

> > +                                         GEN_INT (3), GEN_INT (19),

> > +                                         GEN_INT (4), GEN_INT (20),

> > +                                         GEN_INT (5), GEN_INT (21),

> > +                                         GEN_INT (6), GEN_INT (22),

> > +                                         GEN_INT (7), GEN_INT (23)));

> > +      break;

> > +

> > +    case E_V4HImode:

> > +      sse_mode = V8HImode;

> > +      double_sse_mode = V16HImode;

> > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > +                              gen_rtvec (8,

> > +                                         GEN_INT (0), GEN_INT (8),

> > +                                         GEN_INT (1), GEN_INT (9),

> > +                                         GEN_INT (2), GEN_INT (10),

> > +                                         GEN_INT (3), GEN_INT (11)));

> > +      break;

> > +

> > +    case E_V2SImode:

> > +      sse_mode = V4SImode;

> > +      double_sse_mode = V8SImode;

> > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > +                              gen_rtvec (4,

> > +                                         GEN_INT (0), GEN_INT (4),

> > +                                         GEN_INT (1), GEN_INT (5)));

> > +      break;

> > +

> > +    default:

> > +      gcc_unreachable ();

> > +    }

> > +

> > +  /* Generate SSE punpcklXX.  */

> > +  rtx dest = gen_rtx_REG (sse_mode, REGNO (op0));

> > +  op1 = gen_rtx_REG (sse_mode, REGNO (op1));

> > +  op2 = gen_rtx_REG (sse_mode, REGNO (op2));

> > +

> > +  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);

> > +  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);

> > +  rtx insn = gen_rtx_SET (dest, op2);

> > +  emit_insn (insn);

> > +

> > +  if (high_p)

> > +    {

> > +      /* Move bits 64:127 to bits 0:63.  */

> > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > +                              gen_rtvec (4, GEN_INT (2), GEN_INT (3),

> > +                                         GEN_INT (0), GEN_INT (0)));

> > +      dest = gen_rtx_REG (V4SImode, REGNO (dest));

> > +      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

> > +      insn = gen_rtx_SET (dest, op1);

> > +      emit_insn (insn);

> > +    }

> > +}

> > +

> >  /* Helper function of ix86_fixup_binary_operands to canonicalize

> >     operand order.  Returns true if the operands should be swapped.  */

> >

> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md

> > index c183f949a7c..fbd341109d6 100644

> > --- a/gcc/config/i386/mmx.md

> > +++ b/gcc/config/i386/mmx.md

> > @@ -1083,87 +1083,111 @@

> >     (set_attr "type" "mmxshft,sselog,sselog")

> >     (set_attr "mode" "DI,TI,TI")])

> >

> > -(define_insn "mmx_punpckhbw"

> > -  [(set (match_operand:V8QI 0 "register_operand" "=y")

> > +(define_insn_and_split "mmx_punpckhbw"

> > +  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")

> >         (vec_select:V8QI

> >           (vec_concat:V16QI

> > -           (match_operand:V8QI 1 "register_operand" "0")

> > -           (match_operand:V8QI 2 "nonimmediate_operand" "ym"))

> > +           (match_operand:V8QI 1 "register_operand" "0,0,Yy")

> > +           (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))

> >            (parallel [(const_int 4) (const_int 12)

> >                       (const_int 5) (const_int 13)

> >                       (const_int 6) (const_int 14)

> >                       (const_int 7) (const_int 15)])))]

> > -  "TARGET_MMX"

> > +  "TARGET_MMX_INSNS"

> >    "punpckhbw\t{%2, %0|%0, %2}"

>

> Please add "#" for alternatives that have to be split.

>


Did you mean

(define_insn_and_split "mmx_punpckhbw"
  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")
        (vec_select:V8QI
          (vec_concat:V16QI
            (match_operand:V8QI 1 "register_operand" "0,0,Yy")
            (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))
          (parallel [(const_int 4) (const_int 12)
                     (const_int 5) (const_int 13)
                     (const_int 6) (const_int 14)
                     (const_int 7) (const_int 15)])))]
  "TARGET_MMX_INSNS"
  "#"
  "&& reload_completed && TARGET_MMX_WITH_SSE"
  [(const_int 0)]
  "ix86_split_mmx_punpck (operands, true);"
  [(set_attr "isa" "*,noavx,avx")
   (set_attr "type" "mmxcvt,sselog,sselog")
   (set_attr "mode" "DI,TI,TI")])

(define_insn "*mmx_punpckhbw"
  [(set (match_operand:V8QI 0 "register_operand" "=y")
        (vec_select:V8QI
          (vec_concat:V16QI
            (match_operand:V8QI 1 "register_operand" "0")
            (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
          (parallel [(const_int 4) (const_int 12)
                     (const_int 5) (const_int 13)
                     (const_int 6) (const_int 14)
                     (const_int 7) (const_int 15)])))]
  "TARGET_MMX"
  "punpckhbw\t{%2, %0|%0, %2}"
  [(set_attr "type" "mmxcvt")
   (set_attr "mode" "DI")])

What is the advantage of an extra pattern?

-- 
H.J.
Uros Bizjak Feb. 4, 2019, 2:02 p.m. | #3
On Mon, Feb 4, 2019 at 2:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Mon, Feb 4, 2019 at 4:01 AM Uros Bizjak <ubizjak@gmail.com> wrote:

> >

> > On Fri, Feb 1, 2019 at 10:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > >

> > > Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX.  For MMX punpckhXX,

> > > move bits 64:127 to bits 0:63 in SSE register.  Only SSE register source

> > > operand is allowed.

> > >

> > >         PR target/89021

> > >         * config/i386/i386-protos.h (ix86_split_mmx_punpck): New

> > >         prototype.

> > >         * config/i386/i386.c (ix86_split_mmx_punpck): New function.

> > >         * config/i386/mmx.m (mmx_punpckhbw): Changed to

> > >         define_insn_and_split to support SSE emulation.

> > >         (mmx_punpcklbw): Likewise.

> > >         (mmx_punpckhwd): Likewise.

> > >         (mmx_punpcklwd): Likewise.

> > >         (mmx_punpckhdq): Likewise.

> > >         (mmx_punpckldq): Likewise.

> > > ---

> > >  gcc/config/i386/i386-protos.h |   1 +

> > >  gcc/config/i386/i386.c        |  77 ++++++++++++++++++++++++

> > >  gcc/config/i386/mmx.md        | 108 +++++++++++++++++++++-------------

> > >  3 files changed, 144 insertions(+), 42 deletions(-)

> > >

> > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h

> > > index bb96a420a85..dc7fc38d8e4 100644

> > > --- a/gcc/config/i386/i386-protos.h

> > > +++ b/gcc/config/i386/i386-protos.h

> > > @@ -202,6 +202,7 @@ extern rtx ix86_split_stack_guard (void);

> > >

> > >  extern void ix86_move_vector_high_sse_to_mmx (rtx);

> > >  extern void ix86_split_mmx_pack (rtx[], enum rtx_code);

> > > +extern void ix86_split_mmx_punpck (rtx[], bool);

> > >

> > >  #ifdef TREE_CODE

> > >  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);

> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> > > index fde32983fa2..d795af1dd93 100644

> > > --- a/gcc/config/i386/i386.c

> > > +++ b/gcc/config/i386/i386.c

> > > @@ -20006,6 +20006,83 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

> > >    ix86_move_vector_high_sse_to_mmx (op0);

> > >  }

> > >

> > > +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

> > > +

> > > +void

> > > +ix86_split_mmx_punpck (rtx operands[], bool high_p)

> > > +{

> > > +  rtx op0 = operands[0];

> > > +  rtx op1 = operands[1];

> > > +  rtx op2 = operands[2];

> > > +  machine_mode mode = GET_MODE (op0);

> > > +  rtx mask;

> > > +  /* The corresponding SSE mode.  */

> > > +  machine_mode sse_mode, double_sse_mode;

> > > +

> > > +  switch (mode)

> > > +    {

> > > +    case E_V8QImode:

> > > +      sse_mode = V16QImode;

> > > +      double_sse_mode = V32QImode;

> > > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > > +                              gen_rtvec (16,

> > > +                                         GEN_INT (0), GEN_INT (16),

> > > +                                         GEN_INT (1), GEN_INT (17),

> > > +                                         GEN_INT (2), GEN_INT (18),

> > > +                                         GEN_INT (3), GEN_INT (19),

> > > +                                         GEN_INT (4), GEN_INT (20),

> > > +                                         GEN_INT (5), GEN_INT (21),

> > > +                                         GEN_INT (6), GEN_INT (22),

> > > +                                         GEN_INT (7), GEN_INT (23)));

> > > +      break;

> > > +

> > > +    case E_V4HImode:

> > > +      sse_mode = V8HImode;

> > > +      double_sse_mode = V16HImode;

> > > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > > +                              gen_rtvec (8,

> > > +                                         GEN_INT (0), GEN_INT (8),

> > > +                                         GEN_INT (1), GEN_INT (9),

> > > +                                         GEN_INT (2), GEN_INT (10),

> > > +                                         GEN_INT (3), GEN_INT (11)));

> > > +      break;

> > > +

> > > +    case E_V2SImode:

> > > +      sse_mode = V4SImode;

> > > +      double_sse_mode = V8SImode;

> > > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > > +                              gen_rtvec (4,

> > > +                                         GEN_INT (0), GEN_INT (4),

> > > +                                         GEN_INT (1), GEN_INT (5)));

> > > +      break;

> > > +

> > > +    default:

> > > +      gcc_unreachable ();

> > > +    }

> > > +

> > > +  /* Generate SSE punpcklXX.  */

> > > +  rtx dest = gen_rtx_REG (sse_mode, REGNO (op0));

> > > +  op1 = gen_rtx_REG (sse_mode, REGNO (op1));

> > > +  op2 = gen_rtx_REG (sse_mode, REGNO (op2));

> > > +

> > > +  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);

> > > +  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);

> > > +  rtx insn = gen_rtx_SET (dest, op2);

> > > +  emit_insn (insn);

> > > +

> > > +  if (high_p)

> > > +    {

> > > +      /* Move bits 64:127 to bits 0:63.  */

> > > +      mask = gen_rtx_PARALLEL (VOIDmode,

> > > +                              gen_rtvec (4, GEN_INT (2), GEN_INT (3),

> > > +                                         GEN_INT (0), GEN_INT (0)));

> > > +      dest = gen_rtx_REG (V4SImode, REGNO (dest));

> > > +      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);

> > > +      insn = gen_rtx_SET (dest, op1);

> > > +      emit_insn (insn);

> > > +    }

> > > +}

> > > +

> > >  /* Helper function of ix86_fixup_binary_operands to canonicalize

> > >     operand order.  Returns true if the operands should be swapped.  */

> > >

> > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md

> > > index c183f949a7c..fbd341109d6 100644

> > > --- a/gcc/config/i386/mmx.md

> > > +++ b/gcc/config/i386/mmx.md

> > > @@ -1083,87 +1083,111 @@

> > >     (set_attr "type" "mmxshft,sselog,sselog")

> > >     (set_attr "mode" "DI,TI,TI")])

> > >

> > > -(define_insn "mmx_punpckhbw"

> > > -  [(set (match_operand:V8QI 0 "register_operand" "=y")

> > > +(define_insn_and_split "mmx_punpckhbw"

> > > +  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")

> > >         (vec_select:V8QI

> > >           (vec_concat:V16QI

> > > -           (match_operand:V8QI 1 "register_operand" "0")

> > > -           (match_operand:V8QI 2 "nonimmediate_operand" "ym"))

> > > +           (match_operand:V8QI 1 "register_operand" "0,0,Yy")

> > > +           (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))

> > >            (parallel [(const_int 4) (const_int 12)

> > >                       (const_int 5) (const_int 13)

> > >                       (const_int 6) (const_int 14)

> > >                       (const_int 7) (const_int 15)])))]

> > > -  "TARGET_MMX"

> > > +  "TARGET_MMX_INSNS"

> > >    "punpckhbw\t{%2, %0|%0, %2}"

> >

> > Please add "#" for alternatives that have to be split.

> >

>

> Did you mean

>

> (define_insn_and_split "mmx_punpckhbw"

>   [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")

>         (vec_select:V8QI

>           (vec_concat:V16QI

>             (match_operand:V8QI 1 "register_operand" "0,0,Yy")

>             (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))

>           (parallel [(const_int 4) (const_int 12)

>                      (const_int 5) (const_int 13)

>                      (const_int 6) (const_int 14)

>                      (const_int 7) (const_int 15)])))]

>   "TARGET_MMX_INSNS"

>   "#"

>   "&& reload_completed && TARGET_MMX_WITH_SSE"

>   [(const_int 0)]

>   "ix86_split_mmx_punpck (operands, true);"

>   [(set_attr "isa" "*,noavx,avx")

>    (set_attr "type" "mmxcvt,sselog,sselog")

>    (set_attr "mode" "DI,TI,TI")])

>

> (define_insn "*mmx_punpckhbw"

>   [(set (match_operand:V8QI 0 "register_operand" "=y")

>         (vec_select:V8QI

>           (vec_concat:V16QI

>             (match_operand:V8QI 1 "register_operand" "0")

>             (match_operand:V8QI 2 "nonimmediate_operand" "ym"))

>           (parallel [(const_int 4) (const_int 12)

>                      (const_int 5) (const_int 13)

>                      (const_int 6) (const_int 14)

>                      (const_int 7) (const_int 15)])))]

>   "TARGET_MMX"

>   "punpckhbw\t{%2, %0|%0, %2}"

>   [(set_attr "type" "mmxcvt")

>    (set_attr "mode" "DI")])


No.

You introduced two extra alternatives that have to be split, so you
have to add two additional entries to the insn template. Since these two
have to be split, there is no real instruction to be emitted, so you
need to emit "#" for the corresponding alternatives.

Uros.

> What is the advantage of an extra pattern?

>

> --

> H.J.

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bb96a420a85..dc7fc38d8e4 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -202,6 +202,7 @@  extern rtx ix86_split_stack_guard (void);
 
 extern void ix86_move_vector_high_sse_to_mmx (rtx);
 extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
+extern void ix86_split_mmx_punpck (rtx[], bool);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index fde32983fa2..d795af1dd93 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -20006,6 +20006,83 @@  ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
   ix86_move_vector_high_sse_to_mmx (op0);
 }
 
+/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
+
+void
+ix86_split_mmx_punpck (rtx operands[], bool high_p)
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode mode = GET_MODE (op0);
+  rtx mask;
+  /* The corresponding SSE mode.  */
+  machine_mode sse_mode, double_sse_mode;
+
+  switch (mode)
+    {
+    case E_V8QImode:
+      sse_mode = V16QImode;
+      double_sse_mode = V32QImode;
+      mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (16,
+					  GEN_INT (0), GEN_INT (16),
+					  GEN_INT (1), GEN_INT (17),
+					  GEN_INT (2), GEN_INT (18),
+					  GEN_INT (3), GEN_INT (19),
+					  GEN_INT (4), GEN_INT (20),
+					  GEN_INT (5), GEN_INT (21),
+					  GEN_INT (6), GEN_INT (22),
+					  GEN_INT (7), GEN_INT (23)));
+      break;
+
+    case E_V4HImode:
+      sse_mode = V8HImode;
+      double_sse_mode = V16HImode;
+      mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (8,
+					  GEN_INT (0), GEN_INT (8),
+					  GEN_INT (1), GEN_INT (9),
+					  GEN_INT (2), GEN_INT (10),
+					  GEN_INT (3), GEN_INT (11)));
+      break;
+
+    case E_V2SImode:
+      sse_mode = V4SImode;
+      double_sse_mode = V8SImode;
+      mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (4,
+					  GEN_INT (0), GEN_INT (4),
+					  GEN_INT (1), GEN_INT (5)));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Generate SSE punpcklXX.  */
+  rtx dest = gen_rtx_REG (sse_mode, REGNO (op0));
+  op1 = gen_rtx_REG (sse_mode, REGNO (op1));
+  op2 = gen_rtx_REG (sse_mode, REGNO (op2));
+
+  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
+  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
+  rtx insn = gen_rtx_SET (dest, op2);
+  emit_insn (insn);
+
+  if (high_p)
+    {
+      /* Move bits 64:127 to bits 0:63.  */
+      mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+					  GEN_INT (0), GEN_INT (0)));
+      dest = gen_rtx_REG (V4SImode, REGNO (dest));
+      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+      insn = gen_rtx_SET (dest, op1);
+      emit_insn (insn);
+    }
+}
+
 /* Helper function of ix86_fixup_binary_operands to canonicalize
    operand order.  Returns true if the operands should be swapped.  */
 
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index c183f949a7c..fbd341109d6 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1083,87 +1083,111 @@ 
    (set_attr "type" "mmxshft,sselog,sselog")
    (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_punpckhbw"
-  [(set (match_operand:V8QI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_punpckhbw"
+  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
-	    (match_operand:V8QI 1 "register_operand" "0")
-	    (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
+	    (match_operand:V8QI 1 "register_operand" "0,0,Yy")
+	    (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))
           (parallel [(const_int 4) (const_int 12)
                      (const_int 5) (const_int 13)
                      (const_int 6) (const_int 14)
                      (const_int 7) (const_int 15)])))]
-  "TARGET_MMX"
+  "TARGET_MMX_INSNS"
   "punpckhbw\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxcvt")
-   (set_attr "mode" "DI")])
+  "&& reload_completed && TARGET_MMX_WITH_SSE"  ;; Split for SSE emulation.
+  [(const_int 0)]  ;; Placeholder; the C code below emits the insns.
+  "ix86_split_mmx_punpck (operands, true); DONE;"
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "mmxcvt,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_punpcklbw"
-  [(set (match_operand:V8QI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_punpcklbw"
+  [(set (match_operand:V8QI 0 "register_operand" "=y,Yx,Yy")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
-	    (match_operand:V8QI 1 "register_operand" "0")
-	    (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
+	    (match_operand:V8QI 1 "register_operand" "0,0,Yy")
+	    (match_operand:V8QI 2 "nonimmediate_operand" "ym,Yx,Yy"))
           (parallel [(const_int 0) (const_int 8)
                      (const_int 1) (const_int 9)
                      (const_int 2) (const_int 10)
                      (const_int 3) (const_int 11)])))]
-  "TARGET_MMX"
+  "TARGET_MMX_INSNS"
   "punpcklbw\t{%2, %0|%0, %k2}"
-  [(set_attr "type" "mmxcvt")
-   (set_attr "mode" "DI")])
+  "&& reload_completed && TARGET_MMX_WITH_SSE"  ;; Split for SSE emulation.
+  [(const_int 0)]  ;; Placeholder; the C code below emits the insns.
+  "ix86_split_mmx_punpck (operands, false); DONE;"
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "mmxcvt,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_punpckhwd"
-  [(set (match_operand:V4HI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_punpckhwd"
+  [(set (match_operand:V4HI 0 "register_operand" "=y,Yx,Yy")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
-	    (match_operand:V4HI 1 "register_operand" "0")
-	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))
+	    (match_operand:V4HI 1 "register_operand" "0,0,Yy")
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym,Yx,Yy"))
           (parallel [(const_int 2) (const_int 6)
                      (const_int 3) (const_int 7)])))]
-  "TARGET_MMX"
+  "TARGET_MMX_INSNS"
   "punpckhwd\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxcvt")
-   (set_attr "mode" "DI")])
+  "&& reload_completed && TARGET_MMX_WITH_SSE"  ;; Split for SSE emulation.
+  [(const_int 0)]  ;; Placeholder; the C code below emits the insns.
+  "ix86_split_mmx_punpck (operands, true); DONE;"
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "mmxcvt,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_punpcklwd"
-  [(set (match_operand:V4HI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_punpcklwd"
+  [(set (match_operand:V4HI 0 "register_operand" "=y,Yx,Yy")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
-	    (match_operand:V4HI 1 "register_operand" "0")
-	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))
+	    (match_operand:V4HI 1 "register_operand" "0,0,Yy")
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym,Yx,Yy"))
           (parallel [(const_int 0) (const_int 4)
                      (const_int 1) (const_int 5)])))]
-  "TARGET_MMX"
+  "TARGET_MMX_INSNS"
   "punpcklwd\t{%2, %0|%0, %k2}"
-  [(set_attr "type" "mmxcvt")
-   (set_attr "mode" "DI")])
+  "&& reload_completed && TARGET_MMX_WITH_SSE"  ;; Split for SSE emulation.
+  [(const_int 0)]  ;; Placeholder; the C code below emits the insns.
+  "ix86_split_mmx_punpck (operands, false); DONE;"
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "mmxcvt,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_punpckhdq"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_punpckhdq"
+  [(set (match_operand:V2SI 0 "register_operand" "=y,Yx,Yy")
 	(vec_select:V2SI
 	  (vec_concat:V4SI
-	    (match_operand:V2SI 1 "register_operand" "0")
-	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))
+	    (match_operand:V2SI 1 "register_operand" "0,0,Yy")
+	    (match_operand:V2SI 2 "nonimmediate_operand" "ym,Yx,Yy"))
 	  (parallel [(const_int 1)
 		     (const_int 3)])))]
-  "TARGET_MMX"
+  "TARGET_MMX_INSNS"
   "punpckhdq\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxcvt")
-   (set_attr "mode" "DI")])
+  "&& reload_completed && TARGET_MMX_WITH_SSE"  ;; Split for SSE emulation.
+  [(const_int 0)]  ;; Placeholder; the C code below emits the insns.
+  "ix86_split_mmx_punpck (operands, true); DONE;"
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "mmxcvt,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
-(define_insn "mmx_punpckldq"
-  [(set (match_operand:V2SI 0 "register_operand" "=y")
+(define_insn_and_split "mmx_punpckldq"
+  [(set (match_operand:V2SI 0 "register_operand" "=y,Yx,Yy")
 	(vec_select:V2SI
 	  (vec_concat:V4SI
-	    (match_operand:V2SI 1 "register_operand" "0")
-	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))
+	    (match_operand:V2SI 1 "register_operand" "0,0,Yy")
+	    (match_operand:V2SI 2 "nonimmediate_operand" "ym,Yx,Yy"))
 	  (parallel [(const_int 0)
 		     (const_int 2)])))]
-  "TARGET_MMX"
+  "TARGET_MMX_INSNS"
   "punpckldq\t{%2, %0|%0, %k2}"
-  [(set_attr "type" "mmxcvt")
-   (set_attr "mode" "DI")])
+  "&& reload_completed && TARGET_MMX_WITH_SSE"  ;; Split for SSE emulation.
+  [(const_int 0)]  ;; Placeholder; the C code below emits the insns.
+  "ix86_split_mmx_punpck (operands, false); DONE;"
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "mmxcvt,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
 
 (define_expand "mmx_pinsrw"
   [(set (match_operand:V4HI 0 "register_operand")