x86: Allow V1TI vector register pushes

Message ID 20200516181309.GA172820@gmail.com
State New
Headers show
Series
  • x86: Allow V1TI vector register pushes
Related show

Commit Message

Richard Biener via Gcc-patches May 16, 2020, 6:13 p.m.
On Fri, May 15, 2020 at 11:21:30AM +0200, Uros Bizjak wrote:
> On Wed, May 13, 2020 at 5:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> 

> > > > > The question is, why STV pass creates its funny sequence? The original

> > > > > sequence should be easily solved by storing DImode from XMM register

> > > > > and (with patched gcc) pushing DImode value from the same XMM

> > > > > register.

> > > >

> > > > STV pass is mostly OK since it has to use XMM to move upper and lower

> > > > 32 bits of a 64-bit integer.  The problem is that push XMM becomes very

> > > > expensive later.

> > >

> > > As shown in the patch, XMM push should be just decrement of SP reg and

> > > move to the created stack slot.

> >

> > OK for master if there are no regressions?

> 

> diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c

> index 78fb373db6e..6cad125fa83 100644

> --- a/gcc/config/i386/i386-features.c

> +++ b/gcc/config/i386/i386-features.c

> @@ -1264,7 +1264,8 @@ has_non_address_hard_reg (rtx_insn *insn)

>    FOR_EACH_INSN_DEF (ref, insn)

>      if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))

>      && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)

> -    && DF_REF_REGNO (ref) != FLAGS_REG)

> +    && DF_REF_REGNO (ref) != FLAGS_REG

> +    && DF_REF_REGNO (ref) != SP_REG)

>        return true;

> 

> I don't think this part is correct. The function comment says:

> 

> /* Return 1 if INSN uses or defines a hard register.

>    Hard register uses in a memory address are ignored.

>    Clobbers and flags definitions are ignored.  */

> 

> and you are putting SP_REG into clobber part.

> 

> OTOH, SP_REG in:

> 

> (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

>         (reg/v:DI 85 [ target ])) "x.i":19:5 40 {*pushdi2}

> 

> is inside memory, so REG_SP should already be ignored. Please

> investigate, why it is not the case.


Push isn't a simple memory access since it also updates stack pointer.
Fixed to handle pseudo register push.

> 

> +(define_insn "*pushv1ti2"

> +  [(set (match_operand:V1TI 0 "push_operand" "=<")

> +    (match_operand:V1TI 1 "general_no_elim_operand" "v"))]

> +  ""

> +  "#"

> +  [(set_attr "type" "multi")

> +   (set_attr "mode" "TI")])

> +

> +(define_split

> +  [(set (match_operand:V1TI 0 "push_operand" "")

> +    (match_operand:V1TI 1 "sse_reg_operand" ""))]

> +  "TARGET_64BIT && reload_completed"

> +  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

> +    (set (match_dup 0) (match_dup 1))]

> +{

> +  operands[2] = GEN_INT (-PUSH_ROUNDING (16));

> +  /* Preserve memory attributes. */

> +  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

> +})

> 

> The above part is wrong on many levels, e.g. using wrong predicate,

> unnecessary rounding, it should be defined as define_insn_and_split,

> conditionalized on TARGET_64BIT && TARGET_STV and put nearby existing

> pushti patterns.


Fixed.

> 

> I will implement push changes (modulo V1T1mode) by myself, since they

> are independent of STV stuff.

> 


Here is the updated patch.  Tested on Linux/x86 and Linux/x86-64.  OK
for master?

Thanks.

H.J.
---
Add V1TI vector register push and split it after reload to a sequence
of:

(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))

so that STV pass can convert TI mode integer push to V1TI vector register
push.  Rename has_non_address_hard_reg to pseudo_reg_set, combine calls
of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore
pseudo register push.

Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with
-mpreferred-stack-boundary=2 and leads to segfault:

Dump of assembler code for function __bid_nesd2:
   0x08049210 <+0>:	endbr32
   0x08049214 <+4>:	push   %esi
   0x08049215 <+5>:	push   %ebx
   0x08049216 <+6>:	call   0x8049130 <__x86.get_pc_thunk.bx>
   0x0804921b <+11>:	add    $0x8de5,%ebx
   0x08049221 <+17>:	sub    $0x20,%esp
   0x08049224 <+20>:	mov    0x30(%esp),%esi
   0x08049228 <+24>:	pushl  0x2c(%esp)
   0x0804922c <+28>:	call   0x804e600 <__bid32_to_bid64>
   0x08049231 <+33>:	mov    %esi,(%esp)
   0x08049234 <+36>:	movd   %edx,%xmm1
   0x08049238 <+40>:	movd   %eax,%xmm0
   0x0804923c <+44>:	punpckldq %xmm1,%xmm0
=> 0x08049240 <+48>:	movaps %xmm0,0x10(%esp)
   0x08049245 <+53>:	call   0x804e600 <__bid32_to_bid64>
   0x0804924a <+58>:	push   %edx
   0x0804924b <+59>:	push   %eax
   0x0804924c <+60>:	pushl  0x1c(%esp)
   0x08049250 <+64>:	pushl  0x1c(%esp)
   0x08049254 <+68>:	call   0x804b260 <__bid64_quiet_not_equal>
   0x08049259 <+73>:	add    $0x34,%esp
   0x0804925c <+76>:	pop    %ebx
   0x0804925d <+77>:	pop    %esi
   0x0804925e <+78>:	ret

when libgcc is compiled with -msse2.  According to GCC manual:

'-mpreferred-stack-boundary=NUM'
     Attempt to keep the stack boundary aligned to a 2 raised to NUM
     byte boundary.  If '-mpreferred-stack-boundary' is not specified,
     the default is 4 (16 bytes or 128-bits).

     *Warning:* If you use this switch, then you must build all modules
     with the same value, including any libraries.  This includes the
     system libraries and startup modules.

c-c++-common/dfp/func-vararg-mixed-2.c, which was added by

commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed Jul 30 19:24:02 2008 +0000

    func-vararg-alternate-d128-2.c: New.

    2008-07-30  H.J. Lu  <hongjiu.lu@intel.com>
                Joey Ye  <joey.ye@intel.com>

            * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.
            * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.

isn't expected to work with libgcc.

gcc/

	PR target/95021
	* config/i386/i386-features.c (has_non_address_hard_reg):
	Renamed to ...
	(pseudo_reg_set): This.  Return the SET expression.  Ignore
	pseudo register push.
	(general_scalar_to_vector_candidate_p): Combine single_set and
	has_non_address_hard_reg calls to pseudo_reg_set.
	(timode_scalar_to_vector_candidate_p): Likewise.
	* config/i386/i386.md (*pushv1ti2): New pattern.

gcc/testsuite/

	PR target/95021
	* c-c++-common/dfp/func-vararg-mixed-2.c: Removed.
	* gcc.target/i386/pr95021-1.c: New test.
	* gcc.target/i386/pr95021-2.c: Likewise.
	* gcc.target/i386/pr95021-3.c: Likewise.
	* gcc.target/i386/pr95021-4.c: Likewise.
	* gcc.target/i386/pr95021-5.c: Likewise.
---
 gcc/config/i386/i386-features.c               |  37 +++---
 gcc/config/i386/i386.md                       |  16 +++
 .../c-c++-common/dfp/func-vararg-mixed-2.c    | 105 ------------------
 gcc/testsuite/gcc.target/i386/pr95021-1.c     |  25 +++++
 gcc/testsuite/gcc.target/i386/pr95021-2.c     |  53 +++++++++
 gcc/testsuite/gcc.target/i386/pr95021-3.c     |  25 +++++
 gcc/testsuite/gcc.target/i386/pr95021-4.c     |  25 +++++
 gcc/testsuite/gcc.target/i386/pr95021-5.c     |  59 ++++++++++
 8 files changed, 224 insertions(+), 121 deletions(-)
 delete mode 100644 gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-5.c

-- 
2.26.2

Comments

Richard Biener via Gcc-patches May 17, 2020, 4:07 p.m. | #1
On Sat, May 16, 2020 at 8:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Fri, May 15, 2020 at 11:21:30AM +0200, Uros Bizjak wrote:

> > On Wed, May 13, 2020 at 5:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > > > > > The question is, why STV pass creates its funny sequence? The original

> > > > > > sequence should be easily solved by storing DImode from XMM register

> > > > > > and (with patched gcc) pushing DImode value from the same XMM

> > > > > > register.

> > > > >

> > > > > STV pass is mostly OK since it has to use XMM to move upper and lower

> > > > > 32 bits of a 64-bit integer.  The problem is that push XMM becomes very

> > > > > expensive later.

> > > >

> > > > As shown in the patch, XMM push should be just decrement of SP reg and

> > > > move to the created stack slot.

> > >

> > > OK for master if there are no regressions?

> >

> > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c

> > index 78fb373db6e..6cad125fa83 100644

> > --- a/gcc/config/i386/i386-features.c

> > +++ b/gcc/config/i386/i386-features.c

> > @@ -1264,7 +1264,8 @@ has_non_address_hard_reg (rtx_insn *insn)

> >    FOR_EACH_INSN_DEF (ref, insn)

> >      if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))

> >      && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)

> > -    && DF_REF_REGNO (ref) != FLAGS_REG)

> > +    && DF_REF_REGNO (ref) != FLAGS_REG

> > +    && DF_REF_REGNO (ref) != SP_REG)

> >        return true;

> >

> > I don't think this part is correct. The function comment says:

> >

> > /* Return 1 if INSN uses or defines a hard register.

> >    Hard register uses in a memory address are ignored.

> >    Clobbers and flags definitions are ignored.  */

> >

> > and you are putting SP_REG into clobber part.

> >

> > OTOH, SP_REG in:

> >

> > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

> >         (reg/v:DI 85 [ target ])) "x.i":19:5 40 {*pushdi2}

> >

> > is inside memory, so REG_SP should already be ignored. Please

> > investigate, why it is not the case.

>

> Push isn't a simple memory access since it also updates stack pointer.

> Fixed to handle pseudo register push.

>

> >

> > +(define_insn "*pushv1ti2"

> > +  [(set (match_operand:V1TI 0 "push_operand" "=<")

> > +    (match_operand:V1TI 1 "general_no_elim_operand" "v"))]

> > +  ""

> > +  "#"

> > +  [(set_attr "type" "multi")

> > +   (set_attr "mode" "TI")])

> > +

> > +(define_split

> > +  [(set (match_operand:V1TI 0 "push_operand" "")

> > +    (match_operand:V1TI 1 "sse_reg_operand" ""))]

> > +  "TARGET_64BIT && reload_completed"

> > +  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

> > +    (set (match_dup 0) (match_dup 1))]

> > +{

> > +  operands[2] = GEN_INT (-PUSH_ROUNDING (16));

> > +  /* Preserve memory attributes. */

> > +  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

> > +})

> >

> > The above part is wrong on many levels, e.g. using wrong predicate,

> > unnecessary rounding, it should be defined as define_insn_and_split,

> > conditionalized on TARGET_64BIT && TARGET_STV and put nearby existing

> > pushti patterns.

>

> Fixed.

>

> >

> > I will implement push changes (modulo V1T1mode) by myself, since they

> > are independent of STV stuff.

> >

>

> Here is the updated patch.  Tested on Linux/x86 and Linux/x86-64.  OK

> for master?

>

> Thanks.

>

> H.J.

> ---

> Add V1TI vector register push and split it after reload to a sequence

> of:

>

> (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))

> (set (match_dup 0) (match_dup 1))

>

> so that STV pass can convert TI mode integer push to V1TI vector register

> push.  Rename has_non_address_hard_reg to pseudo_reg_set, combine calls

> of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore

> pseudo register push.

>

> Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with

> -mpreferred-stack-boundary=2 and leads to segfault:

>

> Dump of assembler code for function __bid_nesd2:

>    0x08049210 <+0>:     endbr32

>    0x08049214 <+4>:     push   %esi

>    0x08049215 <+5>:     push   %ebx

>    0x08049216 <+6>:     call   0x8049130 <__x86.get_pc_thunk.bx>

>    0x0804921b <+11>:    add    $0x8de5,%ebx

>    0x08049221 <+17>:    sub    $0x20,%esp

>    0x08049224 <+20>:    mov    0x30(%esp),%esi

>    0x08049228 <+24>:    pushl  0x2c(%esp)

>    0x0804922c <+28>:    call   0x804e600 <__bid32_to_bid64>

>    0x08049231 <+33>:    mov    %esi,(%esp)

>    0x08049234 <+36>:    movd   %edx,%xmm1

>    0x08049238 <+40>:    movd   %eax,%xmm0

>    0x0804923c <+44>:    punpckldq %xmm1,%xmm0

> => 0x08049240 <+48>:    movaps %xmm0,0x10(%esp)

>    0x08049245 <+53>:    call   0x804e600 <__bid32_to_bid64>

>    0x0804924a <+58>:    push   %edx

>    0x0804924b <+59>:    push   %eax

>    0x0804924c <+60>:    pushl  0x1c(%esp)

>    0x08049250 <+64>:    pushl  0x1c(%esp)

>    0x08049254 <+68>:    call   0x804b260 <__bid64_quiet_not_equal>

>    0x08049259 <+73>:    add    $0x34,%esp

>    0x0804925c <+76>:    pop    %ebx

>    0x0804925d <+77>:    pop    %esi

>    0x0804925e <+78>:    ret

>

> when libgcc is compiled with -msse2.  According to GCC manual:

>

> '-mpreferred-stack-boundary=NUM'

>      Attempt to keep the stack boundary aligned to a 2 raised to NUM

>      byte boundary.  If '-mpreferred-stack-boundary' is not specified,

>      the default is 4 (16 bytes or 128-bits).

>

>      *Warning:* If you use this switch, then you must build all modules

>      with the same value, including any libraries.  This includes the

>      system libraries and startup modules.

>

> c-c++-common/dfp/func-vararg-mixed-2.c, which was added by

>

> commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f

> Author: H.J. Lu <hongjiu.lu@intel.com>

> Date:   Wed Jul 30 19:24:02 2008 +0000

>

>     func-vararg-alternate-d128-2.c: New.

>

>     2008-07-30  H.J. Lu  <hongjiu.lu@intel.com>

>                 Joey Ye  <joey.ye@intel.com>

>

>             * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.

>             * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.

>

> isn't expected to work with libgcc.

>

> gcc/

>

>         PR target/95021

>         * config/i386/i386-features.c (has_non_address_hard_reg):

>         Renamed to ...

>         (pseudo_reg_set): This.  Return the SET expression.  Ignore

>         pseudo register push.

>         (general_scalar_to_vector_candidate_p): Combine single_set and

>         has_non_address_hard_reg calls to pseudo_reg_set.

>         (timode_scalar_to_vector_candidate_p): Likewise.

>         * config/i386/i386.md (*pushv1ti2): New pattern.

>

> gcc/testsuite/

>

>         PR target/95021

>         * c-c++-common/dfp/func-vararg-mixed-2.c: Removed.

>         * gcc.target/i386/pr95021-1.c: New test.

>         * gcc.target/i386/pr95021-2.c: Likewise.

>         * gcc.target/i386/pr95021-3.c: Likewise.

>         * gcc.target/i386/pr95021-4.c: Likewise.

>         * gcc.target/i386/pr95021-5.c: Likewise.


OK with a small nit in i386.md and testcase changes, as explained inline.

Thanks,
Uros.

> ---

>  gcc/config/i386/i386-features.c               |  37 +++---

>  gcc/config/i386/i386.md                       |  16 +++

>  .../c-c++-common/dfp/func-vararg-mixed-2.c    | 105 ------------------

>  gcc/testsuite/gcc.target/i386/pr95021-1.c     |  25 +++++

>  gcc/testsuite/gcc.target/i386/pr95021-2.c     |  53 +++++++++

>  gcc/testsuite/gcc.target/i386/pr95021-3.c     |  25 +++++

>  gcc/testsuite/gcc.target/i386/pr95021-4.c     |  25 +++++

>  gcc/testsuite/gcc.target/i386/pr95021-5.c     |  59 ++++++++++

>  8 files changed, 224 insertions(+), 121 deletions(-)

>  delete mode 100644 gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-2.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-3.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-4.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-5.c

>

> diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c

> index 78fb373db6e..b9b764c092a 100644

> --- a/gcc/config/i386/i386-features.c

> +++ b/gcc/config/i386/i386-features.c

> @@ -1253,25 +1253,36 @@ scalar_chain::convert ()

>    return converted_insns;

>  }

>

> -/* Return 1 if INSN uses or defines a hard register.

> -   Hard register uses in a memory address are ignored.

> -   Clobbers and flags definitions are ignored.  */

> +/* Return the SET expression if INSN doesn't reference hard register.

> +   Return NULL if INSN uses or defines a hard register, excluding

> +   pseudo register pushes, hard register uses in a memory address,

> +   clobbers and flags definitions.  */

>

> -static bool

> -has_non_address_hard_reg (rtx_insn *insn)

> +static rtx

> +pseudo_reg_set (rtx_insn *insn)

>  {

> +  rtx set = single_set (insn);

> +  if (!set)

> +    return NULL;

> +

> +  /* Check pseudo register push first. */

> +  if (REG_P (SET_SRC (set))

> +      && !HARD_REGISTER_P (SET_SRC (set))

> +      && push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))))

> +    return set;

> +

>    df_ref ref;

>    FOR_EACH_INSN_DEF (ref, insn)

>      if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))

>         && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)

>         && DF_REF_REGNO (ref) != FLAGS_REG)

> -      return true;

> +      return NULL;

>

>    FOR_EACH_INSN_USE (ref, insn)

>      if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))

> -      return true;

> +      return NULL;

>

> -  return false;

> +  return set;

>  }

>

>  /* Check if comparison INSN may be transformed

> @@ -1345,14 +1356,11 @@ convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)

>  static bool

>  general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)

>  {

> -  rtx def_set = single_set (insn);

> +  rtx def_set = pseudo_reg_set (insn);

>

>    if (!def_set)

>      return false;

>

> -  if (has_non_address_hard_reg (insn))

> -    return false;

> -

>    rtx src = SET_SRC (def_set);

>    rtx dst = SET_DEST (def_set);

>

> @@ -1442,14 +1450,11 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)

>  static bool

>  timode_scalar_to_vector_candidate_p (rtx_insn *insn)

>  {

> -  rtx def_set = single_set (insn);

> +  rtx def_set = pseudo_reg_set (insn);

>

>    if (!def_set)

>      return false;

>

> -  if (has_non_address_hard_reg (insn))

> -    return false;

> -

>    rtx src = SET_SRC (def_set);

>    rtx dst = SET_DEST (def_set);

>

> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

> index 1bf0c1a7f01..bcb44a3fb46 100644

> --- a/gcc/config/i386/i386.md

> +++ b/gcc/config/i386/i386.md

> @@ -1700,6 +1700,22 @@ (define_insn "*pushdi2_rex64"

>    [(set_attr "type" "push,multi,multi")

>     (set_attr "mode" "DI")])

>

> +(define_insn_and_split "*pushv1ti2"

> +  [(set (match_operand:V1TI 0 "push_operand" "=<")

> +       (match_operand:V1TI 1 "register_operand" "v"))]

> +  "TARGET_64BIT && TARGET_STV"

> +  "#"

> +  "&& reload_completed"

> +  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

> +   (set (match_dup 0) (match_dup 1))]

> +{

> +  operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V1TImode)));

> +  /* Preserve memory attributes. */

> +  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

> +}

> +  [(set_attr "type" "multi")

> +   (set_attr "mode" "TI")])


Please put the above pattern just above DWI "*push<mode>2".

> +

>  ;; Convert impossible pushes of immediate to existing instructions.

>  ;; First try to get scratch register and go through it.  In case this

>  ;; fails, push sign extended lower part first and then overwrite

> diff --git a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c b/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c

> deleted file mode 100644

> index 02cafb016d1..00000000000

> --- a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c

> +++ /dev/null

> @@ -1,105 +0,0 @@

> -/* { dg-do run { target { { i?86-*-* x86_64-*-* } && ia32 } } } */

> -/* { dg-options "-mpreferred-stack-boundary=2" } */

> -

> -/* C99 6.5.2.2 Function calls.

> -   Test passing varargs of the combination of decimal float types and

> -   other types.  */

> -

> -#include <stdarg.h>

> -#include "dfp-dbg.h"

> -

> -/* Supposing the list of varying number of arguments is:

> -   unsigned int, _Decimal128, double, _Decimal32, _Decimal64.  */

> -

> -static _Decimal32

> -vararg_d32 (unsigned arg, ...)

> -{

> -  va_list ap;

> -  _Decimal32 result;

> -

> -  va_start (ap, arg);

> -

> -  va_arg (ap, unsigned int);

> -  va_arg (ap, _Decimal128);

> -  va_arg (ap, double);

> -  result = va_arg (ap, _Decimal32);

> -

> -  va_end (ap);

> -  return result;

> -}

> -

> -static _Decimal32

> -vararg_d64 (unsigned arg, ...)

> -{

> -  va_list ap;

> -  _Decimal64 result;

> -

> -  va_start (ap, arg);

> -

> -  va_arg (ap, unsigned int);

> -  va_arg (ap, _Decimal128);

> -  va_arg (ap, double);

> -  va_arg (ap, _Decimal32);

> -  result = va_arg (ap, _Decimal64);

> -

> -  va_end (ap);

> -  return result;

> -}

> -

> -static _Decimal128

> -vararg_d128 (unsigned arg, ...)

> -{

> -  va_list ap;

> -  _Decimal128 result;

> -

> -  va_start (ap, arg);

> -

> -  va_arg (ap, unsigned int);

> -  result = va_arg (ap, _Decimal128);

> -

> -  va_end (ap);

> -  return result;

> -}

> -

> -static unsigned int

> -vararg_int (unsigned arg, ...)

> -{

> -  va_list ap;

> -  unsigned int result;

> -

> -  va_start (ap, arg);

> -

> -  result = va_arg (ap, unsigned int);

> -

> -  va_end (ap);

> -  return result;

> -}

> -

> -static double

> -vararg_double (unsigned arg, ...)

> -{

> -  va_list ap;

> -  float result;

> -

> -  va_start (ap, arg);

> -

> -  va_arg (ap, unsigned int);

> -  va_arg (ap, _Decimal128);

> -  result = va_arg (ap, double);

> -

> -  va_end (ap);

> -  return result;

> -}

> -

> -

> -int

> -main ()

> -{

> -  if (vararg_d32 (3, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 3.0df) FAILURE

> -  if (vararg_d64 (4, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 4.0dd) FAILURE

> -  if (vararg_d128 (1, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 1.0dl) FAILURE

> -  if (vararg_int (0, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 0) FAILURE

> -  if (vararg_double (2, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 2.0) FAILURE

> -

> -  FINISH

> -}

> diff --git a/gcc/testsuite/gcc.target/i386/pr95021-1.c b/gcc/testsuite/gcc.target/i386/pr95021-1.c

> new file mode 100644

> index 00000000000..218776240ee

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95021-1.c

> @@ -0,0 +1,25 @@

> +/* { dg-do compile { target ia32 } } */

> +/* { dg-options "-O2 -msse2 -mstv -mregparm=0 -W" } *


No need for -mregparm=0, it is the the default. Also, do we really need -W?

> +/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */


Please rather scan for "movq    %xmmX, (%esp)", this is what we are
looking for, the bad code has:

        movd    %xmm1, 20(%esp)
        pushl   20(%esp)

Also, we can check that "psrlq   $32, %xmm1" is not there, which means
that DImode value wasn't broken to SImode. Please improve this test in
all 32bit testcases.

> +

> +typedef long long a;

> +struct __jmp_buf_tag   {

> +};

> +typedef struct __jmp_buf_tag sigjmp_buf[1];

> +sigjmp_buf jmp_buf;

> +int __sigsetjmp (sigjmp_buf, int);

> +typedef struct {

> +  a target;

> +} b;

> +extern a *target_p;

> +b *c;

> +extern void foo (a);

> +void

> +d(void)

> +{

> +  if (__sigsetjmp(jmp_buf, 1)) {

> +    a target = *target_p;

> +    c->target = target;

> +    foo(target);

> +  }

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/pr95021-2.c b/gcc/testsuite/gcc.target/i386/pr95021-2.c

> new file mode 100644

> index 00000000000..473b5f81664

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95021-2.c

> @@ -0,0 +1,53 @@

> +/* { dg-do run { target ia32 } } */

> +/* { dg-require-effective-target sse2_runtime } */

> +/* { dg-options "-O2 -msse2 -mstv -W" } */


This is the same testcase as the one above. The default is -mregparm=0.

> +

> +#include <stdlib.h>

> +#include <setjmp.h>

> +

> +jmp_buf buf;

> +

> +long long *target_p;

> +long long *c;

> +

> +int count;

> +

> +__attribute__ ((noclone, noinline))

> +void

> +foo (long long x)

> +{

> +  if (x != *c)

> +    abort ();

> +  if (!count)

> +    {

> +      count++;

> +      longjmp (buf, 1);

> +    }

> +}

> +

> +__attribute__ ((noclone, noinline))

> +void

> +bar (void)

> +{

> +  if (setjmp (buf))

> +    {

> +      long long target = *target_p;

> +      *c = target;

> +      foo (target);

> +    }

> +  else

> +    foo (0);

> +}

> +

> +int

> +main ()

> +{

> +  long long val = 30;

> +  long long local = 0;

> +  target_p = &val;

> +  c = &local;

> +  bar ();

> +  if (val != local)

> +    abort ();

> +  return 0;

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/pr95021-3.c b/gcc/testsuite/gcc.target/i386/pr95021-3.c

> new file mode 100644

> index 00000000000..b213b5db072

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95021-3.c

> @@ -0,0 +1,25 @@

> +/* { dg-do compile { target ia32 } } */

> +/* { dg-options "-O2 -msse2 -mstv -mregparm=3 -W" } */

> +/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */


Just #include "pr95021-1.c". No need to dupicate the same code.

> +typedef long long a;

> +struct __jmp_buf_tag   {

> +};

> +typedef struct __jmp_buf_tag sigjmp_buf[1];

> +sigjmp_buf jmp_buf;

> +int __sigsetjmp (sigjmp_buf, int);

> +typedef struct {

> +  a target;

> +} b;

> +extern a *target_p;

> +b *c;

> +extern void foo (a);

> +void

> +d(void)

> +{

> +  if (__sigsetjmp(jmp_buf, 1)) {

> +    a target = *target_p;

> +    c->target = target;

> +    foo(target);

> +  }> +}

> diff --git a/gcc/testsuite/gcc.target/i386/pr95021-4.c b/gcc/testsuite/gcc.target/i386/pr95021-4.c

> new file mode 100644

> index 00000000000..a080d7b289f

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95021-4.c

> @@ -0,0 +1,25 @@

> +/* { dg-do compile { target int128 } } */

> +/* { dg-options "-O2 -msse2 -mstv -W" } */

> +/* { dg-final { scan-assembler "movdqa\[ \t\]+\[^\n\]*, %xmm" } } */


Please scan for "movaps  %xmmX, (%rsp)" here.

> +

> +typedef __int128 a;

> +struct __jmp_buf_tag   {

> +};

> +typedef struct __jmp_buf_tag sigjmp_buf[1];

> +sigjmp_buf jmp_buf;

> +int __sigsetjmp (sigjmp_buf, int);

> +typedef struct {

> +  a target;

> +} b;

> +extern a *target_p;

> +b *c;

> +extern void foo (int, int, int, int, int, int, a);

> +void

> +d(void)

> +{

> +  if (__sigsetjmp(jmp_buf, 1)) {

> +    a target = *target_p;

> +    c->target = target;

> +    foo(1, 2, 3, 4, 5, 6, target);

> +  }

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/pr95021-5.c b/gcc/testsuite/gcc.target/i386/pr95021-5.c

> new file mode 100644

> index 00000000000..3fcf2c3922e

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95021-5.c

> @@ -0,0 +1,59 @@

> +/* { dg-do run { target int128 } } */

> +/* { dg-require-effective-target sse2_runtime } */

> +/* { dg-options "-O2 -msse2 -mstv -W" } */

> +

> +#include <stdlib.h>

> +#include <setjmp.h>

> +

> +jmp_buf buf;

> +

> +__int128 *target_p;

> +__int128 *c;

> +

> +int count;

> +

> +__attribute__ ((noclone, noinline))

> +void

> +foo (__int128 i1, __int128 i2, __int128 i3, __int128 x)

> +{

> +  if (i1 != 0xbadbeef1)

> +    abort ();

> +  if (i2 != 0x2badbeef)

> +    abort ();

> +  if (i3 != 0xbad3beef)

> +    abort ();

> +  if (x != *c)

> +    abort ();

> +  if (!count)

> +    {

> +      count++;

> +      longjmp (buf, 1);

> +    }

> +}

> +

> +__attribute__ ((noclone, noinline))

> +void

> +bar (void)

> +{

> +  if (setjmp (buf))

> +    {

> +      __int128 target = *target_p;

> +      *c = target;

> +      foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, target);

> +    }

> +  else

> +    foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, 0);

> +}

> +

> +int

> +main ()

> +{

> +  __int128 val = 30;

> +  __int128 local = 0;

> +  target_p = &val;

> +  c = &local;

> +  bar ();

> +  if (val != local)

> +    abort ();

> +  return 0;

> +}

> --

> 2.26.2

>
Richard Biener via Gcc-patches May 17, 2020, 4:53 p.m. | #2
On Sun, May 17, 2020 at 9:07 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Sat, May 16, 2020 at 8:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > On Fri, May 15, 2020 at 11:21:30AM +0200, Uros Bizjak wrote:

> > > On Wed, May 13, 2020 at 5:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > >

> > > > > > > The question is, why STV pass creates its funny sequence? The original

> > > > > > > sequence should be easily solved by storing DImode from XMM register

> > > > > > > and (with patched gcc) pushing DImode value from the same XMM

> > > > > > > register.

> > > > > >

> > > > > > STV pass is mostly OK since it has to use XMM to move upper and lower

> > > > > > 32 bits of a 64-bit integer.  The problem is that push XMM becomes very

> > > > > > expensive later.

> > > > >

> > > > > As shown in the patch, XMM push should be just decrement of SP reg and

> > > > > move to the created stack slot.

> > > >

> > > > OK for master if there are no regressions?

> > >

> > > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c

> > > index 78fb373db6e..6cad125fa83 100644

> > > --- a/gcc/config/i386/i386-features.c

> > > +++ b/gcc/config/i386/i386-features.c

> > > @@ -1264,7 +1264,8 @@ has_non_address_hard_reg (rtx_insn *insn)

> > >    FOR_EACH_INSN_DEF (ref, insn)

> > >      if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))

> > >      && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)

> > > -    && DF_REF_REGNO (ref) != FLAGS_REG)

> > > +    && DF_REF_REGNO (ref) != FLAGS_REG

> > > +    && DF_REF_REGNO (ref) != SP_REG)

> > >        return true;

> > >

> > > I don't think this part is correct. The function comment says:

> > >

> > > /* Return 1 if INSN uses or defines a hard register.

> > >    Hard register uses in a memory address are ignored.

> > >    Clobbers and flags definitions are ignored.  */

> > >

> > > and you are putting SP_REG into clobber part.

> > >

> > > OTOH, SP_REG in:

> > >

> > > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

> > >         (reg/v:DI 85 [ target ])) "x.i":19:5 40 {*pushdi2}

> > >

> > > is inside memory, so REG_SP should already be ignored. Please

> > > investigate, why it is not the case.

> >

> > Push isn't a simple memory access since it also updates stack pointer.

> > Fixed to handle pseudo register push.

> >

> > >

> > > +(define_insn "*pushv1ti2"

> > > +  [(set (match_operand:V1TI 0 "push_operand" "=<")

> > > +    (match_operand:V1TI 1 "general_no_elim_operand" "v"))]

> > > +  ""

> > > +  "#"

> > > +  [(set_attr "type" "multi")

> > > +   (set_attr "mode" "TI")])

> > > +

> > > +(define_split

> > > +  [(set (match_operand:V1TI 0 "push_operand" "")

> > > +    (match_operand:V1TI 1 "sse_reg_operand" ""))]

> > > +  "TARGET_64BIT && reload_completed"

> > > +  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

> > > +    (set (match_dup 0) (match_dup 1))]

> > > +{

> > > +  operands[2] = GEN_INT (-PUSH_ROUNDING (16));

> > > +  /* Preserve memory attributes. */

> > > +  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

> > > +})

> > >

> > > The above part is wrong on many levels, e.g. using wrong predicate,

> > > unnecessary rounding, it should be defined as define_insn_and_split,

> > > conditionalized on TARGET_64BIT && TARGET_STV and put nearby existing

> > > pushti patterns.

> >

> > Fixed.

> >

> > >

> > > I will implement push changes (modulo V1T1mode) by myself, since they

> > > are independent of STV stuff.

> > >

> >

> > Here is the updated patch.  Tested on Linux/x86 and Linux/x86-64.  OK

> > for master?

> >

> > Thanks.

> >

> > H.J.

> > ---

> > Add V1TI vector register push and split it after reload to a sequence

> > of:

> >

> > (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))

> > (set (match_dup 0) (match_dup 1))

> >

> > so that STV pass can convert TI mode integer push to V1TI vector register

> > push.  Rename has_non_address_hard_reg to pseudo_reg_set, combine calls

> > of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore

> > pseudo register push.

> >

> > Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with

> > -mpreferred-stack-boundary=2 and leads to segfault:

> >

> > Dump of assembler code for function __bid_nesd2:

> >    0x08049210 <+0>:     endbr32

> >    0x08049214 <+4>:     push   %esi

> >    0x08049215 <+5>:     push   %ebx

> >    0x08049216 <+6>:     call   0x8049130 <__x86.get_pc_thunk.bx>

> >    0x0804921b <+11>:    add    $0x8de5,%ebx

> >    0x08049221 <+17>:    sub    $0x20,%esp

> >    0x08049224 <+20>:    mov    0x30(%esp),%esi

> >    0x08049228 <+24>:    pushl  0x2c(%esp)

> >    0x0804922c <+28>:    call   0x804e600 <__bid32_to_bid64>

> >    0x08049231 <+33>:    mov    %esi,(%esp)

> >    0x08049234 <+36>:    movd   %edx,%xmm1

> >    0x08049238 <+40>:    movd   %eax,%xmm0

> >    0x0804923c <+44>:    punpckldq %xmm1,%xmm0

> > => 0x08049240 <+48>:    movaps %xmm0,0x10(%esp)

> >    0x08049245 <+53>:    call   0x804e600 <__bid32_to_bid64>

> >    0x0804924a <+58>:    push   %edx

> >    0x0804924b <+59>:    push   %eax

> >    0x0804924c <+60>:    pushl  0x1c(%esp)

> >    0x08049250 <+64>:    pushl  0x1c(%esp)

> >    0x08049254 <+68>:    call   0x804b260 <__bid64_quiet_not_equal>

> >    0x08049259 <+73>:    add    $0x34,%esp

> >    0x0804925c <+76>:    pop    %ebx

> >    0x0804925d <+77>:    pop    %esi

> >    0x0804925e <+78>:    ret

> >

> > when libgcc is compiled with -msse2.  According to GCC manual:

> >

> > '-mpreferred-stack-boundary=NUM'

> >      Attempt to keep the stack boundary aligned to a 2 raised to NUM

> >      byte boundary.  If '-mpreferred-stack-boundary' is not specified,

> >      the default is 4 (16 bytes or 128-bits).

> >

> >      *Warning:* If you use this switch, then you must build all modules

> >      with the same value, including any libraries.  This includes the

> >      system libraries and startup modules.

> >

> > c-c++-common/dfp/func-vararg-mixed-2.c, which was added by

> >

> > commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f

> > Author: H.J. Lu <hongjiu.lu@intel.com>

> > Date:   Wed Jul 30 19:24:02 2008 +0000

> >

> >     func-vararg-alternate-d128-2.c: New.

> >

> >     2008-07-30  H.J. Lu  <hongjiu.lu@intel.com>

> >                 Joey Ye  <joey.ye@intel.com>

> >

> >             * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.

> >             * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.

> >

> > isn't expected to work with libgcc.

> >

> > gcc/

> >

> >         PR target/95021

> >         * config/i386/i386-features.c (has_non_address_hard_reg):

> >         Renamed to ...

> >         (pseudo_reg_set): This.  Return the SET expression.  Ignore

> >         pseudo register push.

> >         (general_scalar_to_vector_candidate_p): Combine single_set and

> >         has_non_address_hard_reg calls to pseudo_reg_set.

> >         (timode_scalar_to_vector_candidate_p): Likewise.

> >         * config/i386/i386.md (*pushv1ti2): New pattern.

> >

> > gcc/testsuite/

> >

> >         PR target/95021

> >         * c-c++-common/dfp/func-vararg-mixed-2.c: Removed.

> >         * gcc.target/i386/pr95021-1.c: New test.

> >         * gcc.target/i386/pr95021-2.c: Likewise.

> >         * gcc.target/i386/pr95021-3.c: Likewise.

> >         * gcc.target/i386/pr95021-4.c: Likewise.

> >         * gcc.target/i386/pr95021-5.c: Likewise.

>

> OK with a small nit in i386.md and testcase changes, as explained inline.

>

> Thanks,

> Uros.

>

> > ---

> >  gcc/config/i386/i386-features.c               |  37 +++---

> >  gcc/config/i386/i386.md                       |  16 +++

> >  .../c-c++-common/dfp/func-vararg-mixed-2.c    | 105 ------------------

> >  gcc/testsuite/gcc.target/i386/pr95021-1.c     |  25 +++++

> >  gcc/testsuite/gcc.target/i386/pr95021-2.c     |  53 +++++++++

> >  gcc/testsuite/gcc.target/i386/pr95021-3.c     |  25 +++++

> >  gcc/testsuite/gcc.target/i386/pr95021-4.c     |  25 +++++

> >  gcc/testsuite/gcc.target/i386/pr95021-5.c     |  59 ++++++++++

> >  8 files changed, 224 insertions(+), 121 deletions(-)

> >  delete mode 100644 gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-1.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-2.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-3.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-4.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-5.c

> >

> > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c

> > index 78fb373db6e..b9b764c092a 100644

> > --- a/gcc/config/i386/i386-features.c

> > +++ b/gcc/config/i386/i386-features.c

> > @@ -1253,25 +1253,36 @@ scalar_chain::convert ()

> >    return converted_insns;

> >  }

> >

> > -/* Return 1 if INSN uses or defines a hard register.

> > -   Hard register uses in a memory address are ignored.

> > -   Clobbers and flags definitions are ignored.  */

> > +/* Return the SET expression if INSN doesn't reference hard register.

> > +   Return NULL if INSN uses or defines a hard register, excluding

> > +   pseudo register pushes, hard register uses in a memory address,

> > +   clobbers and flags definitions.  */

> >

> > -static bool

> > -has_non_address_hard_reg (rtx_insn *insn)

> > +static rtx

> > +pseudo_reg_set (rtx_insn *insn)

> >  {

> > +  rtx set = single_set (insn);

> > +  if (!set)

> > +    return NULL;

> > +

> > +  /* Check pseudo register push first. */

> > +  if (REG_P (SET_SRC (set))

> > +      && !HARD_REGISTER_P (SET_SRC (set))

> > +      && push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))))

> > +    return set;

> > +

> >    df_ref ref;

> >    FOR_EACH_INSN_DEF (ref, insn)

> >      if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))

> >         && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)

> >         && DF_REF_REGNO (ref) != FLAGS_REG)

> > -      return true;

> > +      return NULL;

> >

> >    FOR_EACH_INSN_USE (ref, insn)

> >      if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))

> > -      return true;

> > +      return NULL;

> >

> > -  return false;

> > +  return set;

> >  }

> >

> >  /* Check if comparison INSN may be transformed

> > @@ -1345,14 +1356,11 @@ convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)

> >  static bool

> >  general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)

> >  {

> > -  rtx def_set = single_set (insn);

> > +  rtx def_set = pseudo_reg_set (insn);

> >

> >    if (!def_set)

> >      return false;

> >

> > -  if (has_non_address_hard_reg (insn))

> > -    return false;

> > -

> >    rtx src = SET_SRC (def_set);

> >    rtx dst = SET_DEST (def_set);

> >

> > @@ -1442,14 +1450,11 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)

> >  static bool

> >  timode_scalar_to_vector_candidate_p (rtx_insn *insn)

> >  {

> > -  rtx def_set = single_set (insn);

> > +  rtx def_set = pseudo_reg_set (insn);

> >

> >    if (!def_set)

> >      return false;

> >

> > -  if (has_non_address_hard_reg (insn))

> > -    return false;

> > -

> >    rtx src = SET_SRC (def_set);

> >    rtx dst = SET_DEST (def_set);

> >

> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

> > index 1bf0c1a7f01..bcb44a3fb46 100644

> > --- a/gcc/config/i386/i386.md

> > +++ b/gcc/config/i386/i386.md

> > @@ -1700,6 +1700,22 @@ (define_insn "*pushdi2_rex64"

> >    [(set_attr "type" "push,multi,multi")

> >     (set_attr "mode" "DI")])

> >

> > +(define_insn_and_split "*pushv1ti2"

> > +  [(set (match_operand:V1TI 0 "push_operand" "=<")

> > +       (match_operand:V1TI 1 "register_operand" "v"))]

> > +  "TARGET_64BIT && TARGET_STV"

> > +  "#"

> > +  "&& reload_completed"

> > +  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))

> > +   (set (match_dup 0) (match_dup 1))]

> > +{

> > +  operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V1TImode)));

> > +  /* Preserve memory attributes. */

> > +  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);

> > +}

> > +  [(set_attr "type" "multi")

> > +   (set_attr "mode" "TI")])

>

> Please put the above pattern just above DWI "*push<mode>2".


Fixed.

> > +

> >  ;; Convert impossible pushes of immediate to existing instructions.

> >  ;; First try to get scratch register and go through it.  In case this

> >  ;; fails, push sign extended lower part first and then overwrite

> > diff --git a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c b/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c

> > deleted file mode 100644

> > index 02cafb016d1..00000000000

> > --- a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c

> > +++ /dev/null

> > @@ -1,105 +0,0 @@

> > -/* { dg-do run { target { { i?86-*-* x86_64-*-* } && ia32 } } } */

> > -/* { dg-options "-mpreferred-stack-boundary=2" } */

> > -

> > -/* C99 6.5.2.2 Function calls.

> > -   Test passing varargs of the combination of decimal float types and

> > -   other types.  */

> > -

> > -#include <stdarg.h>

> > -#include "dfp-dbg.h"

> > -

> > -/* Supposing the list of varying number of arguments is:

> > -   unsigned int, _Decimal128, double, _Decimal32, _Decimal64.  */

> > -

> > -static _Decimal32

> > -vararg_d32 (unsigned arg, ...)

> > -{

> > -  va_list ap;

> > -  _Decimal32 result;

> > -

> > -  va_start (ap, arg);

> > -

> > -  va_arg (ap, unsigned int);

> > -  va_arg (ap, _Decimal128);

> > -  va_arg (ap, double);

> > -  result = va_arg (ap, _Decimal32);

> > -

> > -  va_end (ap);

> > -  return result;

> > -}

> > -

> > -static _Decimal32

> > -vararg_d64 (unsigned arg, ...)

> > -{

> > -  va_list ap;

> > -  _Decimal64 result;

> > -

> > -  va_start (ap, arg);

> > -

> > -  va_arg (ap, unsigned int);

> > -  va_arg (ap, _Decimal128);

> > -  va_arg (ap, double);

> > -  va_arg (ap, _Decimal32);

> > -  result = va_arg (ap, _Decimal64);

> > -

> > -  va_end (ap);

> > -  return result;

> > -}

> > -

> > -static _Decimal128

> > -vararg_d128 (unsigned arg, ...)

> > -{

> > -  va_list ap;

> > -  _Decimal128 result;

> > -

> > -  va_start (ap, arg);

> > -

> > -  va_arg (ap, unsigned int);

> > -  result = va_arg (ap, _Decimal128);

> > -

> > -  va_end (ap);

> > -  return result;

> > -}

> > -

> > -static unsigned int

> > -vararg_int (unsigned arg, ...)

> > -{

> > -  va_list ap;

> > -  unsigned int result;

> > -

> > -  va_start (ap, arg);

> > -

> > -  result = va_arg (ap, unsigned int);

> > -

> > -  va_end (ap);

> > -  return result;

> > -}

> > -

> > -static double

> > -vararg_double (unsigned arg, ...)

> > -{

> > -  va_list ap;

> > -  float result;

> > -

> > -  va_start (ap, arg);

> > -

> > -  va_arg (ap, unsigned int);

> > -  va_arg (ap, _Decimal128);

> > -  result = va_arg (ap, double);

> > -

> > -  va_end (ap);

> > -  return result;

> > -}

> > -

> > -

> > -int

> > -main ()

> > -{

> > -  if (vararg_d32 (3, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 3.0df) FAILURE

> > -  if (vararg_d64 (4, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 4.0dd) FAILURE

> > -  if (vararg_d128 (1, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 1.0dl) FAILURE

> > -  if (vararg_int (0, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 0) FAILURE

> > -  if (vararg_double (2, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 2.0) FAILURE

> > -

> > -  FINISH

> > -}

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-1.c b/gcc/testsuite/gcc.target/i386/pr95021-1.c

> > new file mode 100644

> > index 00000000000..218776240ee

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95021-1.c

> > @@ -0,0 +1,25 @@

> > +/* { dg-do compile { target ia32 } } */

> > +/* { dg-options "-O2 -msse2 -mstv -mregparm=0 -W" } *

>

> No need for -mregparm=0, it is the the default. Also, do we really need -W?


Fixed.  -W is added for

[hjl@gnu-cfl-2 i386]$ gcc -m32 pr95021-1.c -msse2 -mstv -W -O2 -S -o /tmp/x.s
pr95021-1.c: In function ‘bar’:
pr95021-1.c:21:17: warning: variable ‘target’ might be clobbered by
‘longjmp’ or ‘vfork’ [-Wclobbered]
   21 |       long long target = *target_p;
      |                 ^~~~~~
[hjl@gnu-cfl-2 i386]$

> > +/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */

>

> Please rather scan for "movq    %xmmX, (%esp)", this is what we are

> looking for, the bad code has:

>

>         movd    %xmm1, 20(%esp)

>         pushl   20(%esp)


Fixed.

> Also, we can check that "psrlq   $32, %xmm1" is not there, which means

> that DImode value wasn't broken to SImode. Please improve this test in

> all 32bit testcases.


Fixed.  I consolidated compile-time tests with run-time tests.

> > +

> > +typedef long long a;

> > +struct __jmp_buf_tag   {

> > +};

> > +typedef struct __jmp_buf_tag sigjmp_buf[1];

> > +sigjmp_buf jmp_buf;

> > +int __sigsetjmp (sigjmp_buf, int);

> > +typedef struct {

> > +  a target;

> > +} b;

> > +extern a *target_p;

> > +b *c;

> > +extern void foo (a);

> > +void

> > +d(void)

> > +{

> > +  if (__sigsetjmp(jmp_buf, 1)) {

> > +    a target = *target_p;

> > +    c->target = target;

> > +    foo(target);

> > +  }

> > +}

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-2.c b/gcc/testsuite/gcc.target/i386/pr95021-2.c

> > new file mode 100644

> > index 00000000000..473b5f81664

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95021-2.c

> > @@ -0,0 +1,53 @@

> > +/* { dg-do run { target ia32 } } */

> > +/* { dg-require-effective-target sse2_runtime } */

> > +/* { dg-options "-O2 -msse2 -mstv -W" } */

>

> This is the same testcase as the one above. The default is -mregparm=0.


I added this run-time to verify that the new *pushv1ti2 pattern works correctly.

> > +

> > +

> > +#include <stdlib.h>

> > +#include <setjmp.h>

> > +

> > +jmp_buf buf;

> > +

> > +long long *target_p;

> > +long long *c;

> > +

> > +int count;

> > +

> > +__attribute__ ((noclone, noinline))

> > +void

> > +foo (long long x)

> > +{

> > +  if (x != *c)

> > +    abort ();

> > +  if (!count)

> > +    {

> > +      count++;

> > +      longjmp (buf, 1);

> > +    }

> > +}

> > +

> > +__attribute__ ((noclone, noinline))

> > +void

> > +bar (void)

> > +{

> > +  if (setjmp (buf))

> > +    {

> > +      long long target = *target_p;

> > +      *c = target;

> > +      foo (target);

> > +    }

> > +  else

> > +    foo (0);

> > +}

> > +

> > +int

> > +main ()

> > +{

> > +  long long val = 30;

> > +  long long local = 0;

> > +  target_p = &val;

> > +  c = &local;

> > +  bar ();

> > +  if (val != local)

> > +    abort ();

> > +  return 0;

> > +}

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-3.c b/gcc/testsuite/gcc.target/i386/pr95021-3.c

> > new file mode 100644

> > index 00000000000..b213b5db072

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95021-3.c

> > @@ -0,0 +1,25 @@

> > +/* { dg-do compile { target ia32 } } */

> > +/* { dg-options "-O2 -msse2 -mstv -mregparm=3 -W" } */

> > +/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */

>

> Just #include "pr95021-1.c". No need to dupicate the same code.


Fixed.

> > +typedef long long a;

> > +struct __jmp_buf_tag   {

> > +};

> > +typedef struct __jmp_buf_tag sigjmp_buf[1];

> > +sigjmp_buf jmp_buf;

> > +int __sigsetjmp (sigjmp_buf, int);

> > +typedef struct {

> > +  a target;

> > +} b;

> > +extern a *target_p;

> > +b *c;

> > +extern void foo (a);

> > +void

> > +d(void)

> > +{

> > +  if (__sigsetjmp(jmp_buf, 1)) {

> > +    a target = *target_p;

> > +    c->target = target;

> > +    foo(target);

> > +  }> +}

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-4.c b/gcc/testsuite/gcc.target/i386/pr95021-4.c

> > new file mode 100644

> > index 00000000000..a080d7b289f

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95021-4.c

> > @@ -0,0 +1,25 @@

> > +/* { dg-do compile { target int128 } } */

> > +/* { dg-options "-O2 -msse2 -mstv -W" } */

> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+\[^\n\]*, %xmm" } } */

>

> Please scan for "movaps  %xmmX, (%rsp)" here.


Fixed.

> > +

> > +typedef __int128 a;

> > +struct __jmp_buf_tag   {

> > +};

> > +typedef struct __jmp_buf_tag sigjmp_buf[1];

> > +sigjmp_buf jmp_buf;

> > +int __sigsetjmp (sigjmp_buf, int);

> > +typedef struct {

> > +  a target;

> > +} b;

> > +extern a *target_p;

> > +b *c;

> > +extern void foo (int, int, int, int, int, int, a);

> > +void

> > +d(void)

> > +{

> > +  if (__sigsetjmp(jmp_buf, 1)) {

> > +    a target = *target_p;

> > +    c->target = target;

> > +    foo(1, 2, 3, 4, 5, 6, target);

> > +  }

> > +}

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95021-5.c b/gcc/testsuite/gcc.target/i386/pr95021-5.c

> > new file mode 100644

> > index 00000000000..3fcf2c3922e

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95021-5.c

> > @@ -0,0 +1,59 @@

> > +/* { dg-do run { target int128 } } */

> > +/* { dg-require-effective-target sse2_runtime } */

> > +/* { dg-options "-O2 -msse2 -mstv -W" } */

> > +

> > +#include <stdlib.h>

> > +#include <setjmp.h>

> > +

> > +jmp_buf buf;

> > +

> > +__int128 *target_p;

> > +__int128 *c;

> > +

> > +int count;

> > +

> > +__attribute__ ((noclone, noinline))

> > +void

> > +foo (__int128 i1, __int128 i2, __int128 i3, __int128 x)

> > +{

> > +  if (i1 != 0xbadbeef1)

> > +    abort ();

> > +  if (i2 != 0x2badbeef)

> > +    abort ();

> > +  if (i3 != 0xbad3beef)

> > +    abort ();

> > +  if (x != *c)

> > +    abort ();

> > +  if (!count)

> > +    {

> > +      count++;

> > +      longjmp (buf, 1);

> > +    }

> > +}

> > +

> > +__attribute__ ((noclone, noinline))

> > +void

> > +bar (void)

> > +{

> > +  if (setjmp (buf))

> > +    {

> > +      __int128 target = *target_p;

> > +      *c = target;

> > +      foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, target);

> > +    }

> > +  else

> > +    foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, 0);

> > +}

> > +

> > +int

> > +main ()

> > +{

> > +  __int128 val = 30;

> > +  __int128 local = 0;

> > +  target_p = &val;

> > +  c = &local;

> > +  bar ();

> > +  if (val != local)

> > +    abort ();

> > +  return 0;

> > +}

> > --

> > 2.26.2

> >


This is the updated patch I am checking in.

Thanks.

-- 
H.J.
From 3d24d6f2a9bac6a348a731e6df80ce60995214a9 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 12 May 2020 11:30:29 -0700
Subject: [PATCH] x86: Allow V1TI vector register pushes

Add V1TI vector register push and split it after reload to a sequence
of:

(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))

so that STV pass can convert TI mode integer push to V1TI vector register
push.  Rename has_non_address_hard_reg to pseudo_reg_set, combine calls
of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore
pseudo register push.

Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with
-mpreferred-stack-boundary=2 and leads to segfault:

Dump of assembler code for function __bid_nesd2:
   0x08049210 <+0>:	endbr32
   0x08049214 <+4>:	push   %esi
   0x08049215 <+5>:	push   %ebx
   0x08049216 <+6>:	call   0x8049130 <__x86.get_pc_thunk.bx>
   0x0804921b <+11>:	add    $0x8de5,%ebx
   0x08049221 <+17>:	sub    $0x20,%esp
   0x08049224 <+20>:	mov    0x30(%esp),%esi
   0x08049228 <+24>:	pushl  0x2c(%esp)
   0x0804922c <+28>:	call   0x804e600 <__bid32_to_bid64>
   0x08049231 <+33>:	mov    %esi,(%esp)
   0x08049234 <+36>:	movd   %edx,%xmm1
   0x08049238 <+40>:	movd   %eax,%xmm0
   0x0804923c <+44>:	punpckldq %xmm1,%xmm0
=> 0x08049240 <+48>:	movaps %xmm0,0x10(%esp)
   0x08049245 <+53>:	call   0x804e600 <__bid32_to_bid64>
   0x0804924a <+58>:	push   %edx
   0x0804924b <+59>:	push   %eax
   0x0804924c <+60>:	pushl  0x1c(%esp)
   0x08049250 <+64>:	pushl  0x1c(%esp)
   0x08049254 <+68>:	call   0x804b260 <__bid64_quiet_not_equal>
   0x08049259 <+73>:	add    $0x34,%esp
   0x0804925c <+76>:	pop    %ebx
   0x0804925d <+77>:	pop    %esi
   0x0804925e <+78>:	ret

when libgcc is compiled with -msse2.  According to GCC manual:

'-mpreferred-stack-boundary=NUM'
     Attempt to keep the stack boundary aligned to a 2 raised to NUM
     byte boundary.  If '-mpreferred-stack-boundary' is not specified,
     the default is 4 (16 bytes or 128-bits).

     *Warning:* If you use this switch, then you must build all modules
     with the same value, including any libraries.  This includes the
     system libraries and startup modules.

c-c++-common/dfp/func-vararg-mixed-2.c, which was added by

commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed Jul 30 19:24:02 2008 +0000

    func-vararg-alternate-d128-2.c: New.

    2008-07-30  H.J. Lu  <hongjiu.lu@intel.com>
                Joey Ye  <joey.ye@intel.com>

            * gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.
            * gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.

isn't expected to work with libgcc.

gcc/

	PR target/95021
	* config/i386/i386-features.c (has_non_address_hard_reg):
	Renamed to ...
	(pseudo_reg_set): This.  Return the SET expression.  Ignore
	pseudo register push.
	(general_scalar_to_vector_candidate_p): Combine single_set and
	has_non_address_hard_reg calls to pseudo_reg_set.
	(timode_scalar_to_vector_candidate_p): Likewise.
	* config/i386/i386.md (*pushv1ti2): New pattern.

gcc/testsuite/

	PR target/95021
	* c-c++-common/dfp/func-vararg-mixed-2.c: Removed.
	* gcc.target/i386/pr95021-1.c: New test.
	* gcc.target/i386/pr95021-2.c: Likewise.
	* gcc.target/i386/pr95021-3.c: Likewise.
	* gcc.target/i386/pr95021-4.c: Likewise.
	* gcc.target/i386/pr95021-5.c: Likewise.
---
 gcc/config/i386/i386-features.c               |  37 +++---
 gcc/config/i386/i386.md                       |  16 +++
 .../c-c++-common/dfp/func-vararg-mixed-2.c    | 105 ------------------
 gcc/testsuite/gcc.target/i386/pr95021-1.c     |  27 +++++
 gcc/testsuite/gcc.target/i386/pr95021-2.c     |  39 +++++++
 gcc/testsuite/gcc.target/i386/pr95021-3.c     |   5 +
 gcc/testsuite/gcc.target/i386/pr95021-4.c     |  28 +++++
 gcc/testsuite/gcc.target/i386/pr95021-5.c     |  45 ++++++++
 8 files changed, 181 insertions(+), 121 deletions(-)
 delete mode 100644 gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-5.c

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 78fb373db6e..b9b764c092a 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1253,25 +1253,36 @@ scalar_chain::convert ()
   return converted_insns;
 }
 
-/* Return 1 if INSN uses or defines a hard register.
-   Hard register uses in a memory address are ignored.
-   Clobbers and flags definitions are ignored.  */
+/* Return the SET expression if INSN doesn't reference hard register.
+   Return NULL if INSN uses or defines a hard register, excluding
+   pseudo register pushes, hard register uses in a memory address,
+   clobbers and flags definitions.  */
 
-static bool
-has_non_address_hard_reg (rtx_insn *insn)
+static rtx
+pseudo_reg_set (rtx_insn *insn)
 {
+  rtx set = single_set (insn);
+  if (!set)
+    return NULL;
+
+  /* Check pseudo register push first. */
+  if (REG_P (SET_SRC (set))
+      && !HARD_REGISTER_P (SET_SRC (set))
+      && push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))))
+    return set;
+
   df_ref ref;
   FOR_EACH_INSN_DEF (ref, insn)
     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
 	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
 	&& DF_REF_REGNO (ref) != FLAGS_REG)
-      return true;
+      return NULL;
 
   FOR_EACH_INSN_USE (ref, insn)
     if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
-      return true;
+      return NULL;
 
-  return false;
+  return set;
 }
 
 /* Check if comparison INSN may be transformed
@@ -1345,14 +1356,11 @@ convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
 static bool
 general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 {
-  rtx def_set = single_set (insn);
+  rtx def_set = pseudo_reg_set (insn);
 
   if (!def_set)
     return false;
 
-  if (has_non_address_hard_reg (insn))
-    return false;
-
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
 
@@ -1442,14 +1450,11 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 static bool
 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
 {
-  rtx def_set = single_set (insn);
+  rtx def_set = pseudo_reg_set (insn);
 
   if (!def_set)
     return false;
 
-  if (has_non_address_hard_reg (insn))
-    return false;
-
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1bf0c1a7f01..9fd32f28bf3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1674,6 +1674,22 @@ (define_insn "*cmpi<unord><MODEF:mode>"
 
 ;; Push/pop instructions.
 
+(define_insn_and_split "*pushv1ti2"
+  [(set (match_operand:V1TI 0 "push_operand" "=<")
+	(match_operand:V1TI 1 "register_operand" "v"))]
+  "TARGET_64BIT && TARGET_STV"
+  "#"
+  "&& reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+   (set (match_dup 0) (match_dup 1))]
+{
+  operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V1TImode)));
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+}
+  [(set_attr "type" "multi")
+   (set_attr "mode" "TI")])
+
 (define_insn "*push<mode>2"
   [(set (match_operand:DWI 0 "push_operand" "=<,<")
 	(match_operand:DWI 1 "general_no_elim_operand" "riF*o,*v"))]
diff --git a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c b/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
deleted file mode 100644
index 02cafb016d1..00000000000
--- a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/* { dg-do run { target { { i?86-*-* x86_64-*-* } && ia32 } } } */
-/* { dg-options "-mpreferred-stack-boundary=2" } */
-
-/* C99 6.5.2.2 Function calls.
-   Test passing varargs of the combination of decimal float types and
-   other types.  */
-
-#include <stdarg.h>
-#include "dfp-dbg.h"
-
-/* Supposing the list of varying number of arguments is:
-   unsigned int, _Decimal128, double, _Decimal32, _Decimal64.  */
-
-static _Decimal32
-vararg_d32 (unsigned arg, ...)
-{
-  va_list ap;
-  _Decimal32 result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  va_arg (ap, _Decimal128);
-  va_arg (ap, double);
-  result = va_arg (ap, _Decimal32);
-
-  va_end (ap);
-  return result;
-}
-
-static _Decimal32
-vararg_d64 (unsigned arg, ...)
-{
-  va_list ap;
-  _Decimal64 result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  va_arg (ap, _Decimal128);
-  va_arg (ap, double);
-  va_arg (ap, _Decimal32);
-  result = va_arg (ap, _Decimal64);
-
-  va_end (ap);
-  return result;
-}
-
-static _Decimal128
-vararg_d128 (unsigned arg, ...)
-{
-  va_list ap;
-  _Decimal128 result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  result = va_arg (ap, _Decimal128);
-
-  va_end (ap);
-  return result;
-}
-
-static unsigned int
-vararg_int (unsigned arg, ...)
-{
-  va_list ap;
-  unsigned int result;
-
-  va_start (ap, arg);
-
-  result = va_arg (ap, unsigned int);
-
-  va_end (ap);
-  return result;
-}
-
-static double
-vararg_double (unsigned arg, ...)
-{
-  va_list ap;
-  float result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  va_arg (ap, _Decimal128);
-  result = va_arg (ap, double);
-
-  va_end (ap);
-  return result;
-}
-
-
-int
-main ()
-{
-  if (vararg_d32 (3, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 3.0df) FAILURE
-  if (vararg_d64 (4, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 4.0dd) FAILURE
-  if (vararg_d128 (1, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 1.0dl) FAILURE
-  if (vararg_int (0, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 0) FAILURE
-  if (vararg_double (2, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 2.0) FAILURE
-
-  FINISH
-}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-1.c b/gcc/testsuite/gcc.target/i386/pr95021-1.c
new file mode 100644
index 00000000000..a0b9a262a87
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+/* { dg-final { scan-assembler "movq\[ \t\]%xmm\[0-9\]+, \\(%esp\\)" } } */
+/* { dg-final { scan-assembler-not "psrlq" } } */
+
+#include <setjmp.h>
+
+extern jmp_buf buf;
+
+extern long long *target_p;
+extern long long *c;
+
+extern void foo (long long);
+
+__attribute__ ((noclone, noinline))
+void
+bar (void)
+{
+  if (setjmp (buf))
+    {
+      long long target = *target_p;
+      *c = target;
+      foo (target);
+    }
+  else
+    foo (0);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-2.c b/gcc/testsuite/gcc.target/i386/pr95021-2.c
new file mode 100644
index 00000000000..53247e52784
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-2.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target ia32 } } */
+/* { dg-require-effective-target sse2_runtime } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+
+#include <stdlib.h>
+#include "pr95021-1.c"
+
+jmp_buf buf;
+
+long long *target_p;
+long long *c;
+
+int count;
+
+__attribute__ ((noclone, noinline))
+void
+foo (long long x)
+{
+  if (x != *c)
+    abort ();
+  if (!count)
+    {
+      count++;
+      longjmp (buf, 1);
+    }
+}
+
+int
+main ()
+{
+  long long val = 30;
+  long long local = 0;
+  target_p = &val;
+  c = &local;
+  bar ();
+  if (val != local)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-3.c b/gcc/testsuite/gcc.target/i386/pr95021-3.c
new file mode 100644
index 00000000000..1748161a77c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-3.c
@@ -0,0 +1,5 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mregparm=3 -W" } */
+/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */
+
+#include "pr95021-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-4.c b/gcc/testsuite/gcc.target/i386/pr95021-4.c
new file mode 100644
index 00000000000..d5bb28cc01a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-4.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+/* { dg-final { scan-assembler "(movaps|vmovdqa)\[ \t\]%xmm\[0-9\]+, \\(%rsp\\)" } } */
+
+#include <setjmp.h>
+
+extern jmp_buf buf;
+
+extern __int128 *target_p;
+__int128 *c;
+
+extern int count;
+
+extern void foo (__int128, __int128, __int128, __int128);
+
+__attribute__ ((noclone, noinline))
+void
+bar (void)
+{
+  if (setjmp (buf))
+    {
+      __int128 target = *target_p;
+      *c = target;
+      foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, target);
+    }
+  else
+    foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, 0);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-5.c b/gcc/testsuite/gcc.target/i386/pr95021-5.c
new file mode 100644
index 00000000000..d8658095bc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-5.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target int128 } } */
+/* { dg-require-effective-target sse2_runtime } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+
+#include <stdlib.h>
+#include "pr95021-4.c"
+
+jmp_buf buf;
+
+__int128 *target_p;
+__int128 *c;
+
+int count;
+
+__attribute__ ((noclone, noinline))
+void
+foo (__int128 i1, __int128 i2, __int128 i3, __int128 x)
+{
+  if (i1 != 0xbadbeef1)
+    abort ();
+  if (i2 != 0x2badbeef)
+    abort ();
+  if (i3 != 0xbad3beef)
+    abort ();
+  if (x != *c)
+    abort ();
+  if (!count)
+    {
+      count++;
+      longjmp (buf, 1);
+    }
+}
+
+int
+main ()
+{
+  __int128 val = 30;
+  __int128 local = 0;
+  target_p = &val;
+  c = &local;
+  bar ();
+  if (val != local)
+    abort ();
+  return 0;
+}

Patch

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 78fb373db6e..b9b764c092a 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1253,25 +1253,36 @@  scalar_chain::convert ()
   return converted_insns;
 }
 
-/* Return 1 if INSN uses or defines a hard register.
-   Hard register uses in a memory address are ignored.
-   Clobbers and flags definitions are ignored.  */
+/* Return the SET expression if INSN doesn't reference hard register.
+   Return NULL if INSN uses or defines a hard register, excluding
+   pseudo register pushes, hard register uses in a memory address,
+   clobbers and flags definitions.  */
 
-static bool
-has_non_address_hard_reg (rtx_insn *insn)
+static rtx
+pseudo_reg_set (rtx_insn *insn)
 {
+  rtx set = single_set (insn);
+  if (!set)
+    return NULL;
+
+  /* Check pseudo register push first. */
+  if (REG_P (SET_SRC (set))
+      && !HARD_REGISTER_P (SET_SRC (set))
+      && push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))))
+    return set;
+
   df_ref ref;
   FOR_EACH_INSN_DEF (ref, insn)
     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
 	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
 	&& DF_REF_REGNO (ref) != FLAGS_REG)
-      return true;
+      return NULL;
 
   FOR_EACH_INSN_USE (ref, insn)
     if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
-      return true;
+      return NULL;
 
-  return false;
+  return set;
 }
 
 /* Check if comparison INSN may be transformed
@@ -1345,14 +1356,11 @@  convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
 static bool
 general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 {
-  rtx def_set = single_set (insn);
+  rtx def_set = pseudo_reg_set (insn);
 
   if (!def_set)
     return false;
 
-  if (has_non_address_hard_reg (insn))
-    return false;
-
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
 
@@ -1442,14 +1450,11 @@  general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 static bool
 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
 {
-  rtx def_set = single_set (insn);
+  rtx def_set = pseudo_reg_set (insn);
 
   if (!def_set)
     return false;
 
-  if (has_non_address_hard_reg (insn))
-    return false;
-
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1bf0c1a7f01..bcb44a3fb46 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1700,6 +1700,22 @@  (define_insn "*pushdi2_rex64"
   [(set_attr "type" "push,multi,multi")
    (set_attr "mode" "DI")])
 
+(define_insn_and_split "*pushv1ti2"
+  [(set (match_operand:V1TI 0 "push_operand" "=<")
+	(match_operand:V1TI 1 "register_operand" "v"))]
+  "TARGET_64BIT && TARGET_STV"
+  "#"
+  "&& reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+   (set (match_dup 0) (match_dup 1))]
+{
+  operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V1TImode)));
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+}
+  [(set_attr "type" "multi")
+   (set_attr "mode" "TI")])
+
 ;; Convert impossible pushes of immediate to existing instructions.
 ;; First try to get scratch register and go through it.  In case this
 ;; fails, push sign extended lower part first and then overwrite
diff --git a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c b/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
deleted file mode 100644
index 02cafb016d1..00000000000
--- a/gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
+++ /dev/null
@@ -1,105 +0,0 @@ 
-/* { dg-do run { target { { i?86-*-* x86_64-*-* } && ia32 } } } */
-/* { dg-options "-mpreferred-stack-boundary=2" } */
-
-/* C99 6.5.2.2 Function calls.
-   Test passing varargs of the combination of decimal float types and
-   other types.  */
-
-#include <stdarg.h>
-#include "dfp-dbg.h"
-
-/* Supposing the list of varying number of arguments is:
-   unsigned int, _Decimal128, double, _Decimal32, _Decimal64.  */
-
-static _Decimal32
-vararg_d32 (unsigned arg, ...)
-{
-  va_list ap;
-  _Decimal32 result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  va_arg (ap, _Decimal128);
-  va_arg (ap, double);
-  result = va_arg (ap, _Decimal32);
-
-  va_end (ap);
-  return result;
-}
-
-static _Decimal32
-vararg_d64 (unsigned arg, ...)
-{
-  va_list ap;
-  _Decimal64 result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  va_arg (ap, _Decimal128);
-  va_arg (ap, double);
-  va_arg (ap, _Decimal32);
-  result = va_arg (ap, _Decimal64);
-
-  va_end (ap);
-  return result;
-}
-
-static _Decimal128
-vararg_d128 (unsigned arg, ...)
-{
-  va_list ap;
-  _Decimal128 result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  result = va_arg (ap, _Decimal128);
-
-  va_end (ap);
-  return result;
-}
-
-static unsigned int
-vararg_int (unsigned arg, ...)
-{
-  va_list ap;
-  unsigned int result;
-
-  va_start (ap, arg);
-
-  result = va_arg (ap, unsigned int);
-
-  va_end (ap);
-  return result;
-}
-
-static double
-vararg_double (unsigned arg, ...)
-{
-  va_list ap;
-  float result;
-
-  va_start (ap, arg);
-
-  va_arg (ap, unsigned int);
-  va_arg (ap, _Decimal128);
-  result = va_arg (ap, double);
-
-  va_end (ap);
-  return result;
-}
-
-
-int
-main ()
-{
-  if (vararg_d32 (3, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 3.0df) FAILURE
-  if (vararg_d64 (4, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 4.0dd) FAILURE
-  if (vararg_d128 (1, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 1.0dl) FAILURE
-  if (vararg_int (0, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 0) FAILURE
-  if (vararg_double (2, 0, 1.0dl, 2.0, 3.0df, 4.0dd) != 2.0) FAILURE
-
-  FINISH
-}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-1.c b/gcc/testsuite/gcc.target/i386/pr95021-1.c
new file mode 100644
index 00000000000..218776240ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-1.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mregparm=0 -W" } */
+/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */
+
+typedef long long a;
+struct __jmp_buf_tag   {
+};
+typedef struct __jmp_buf_tag sigjmp_buf[1];
+sigjmp_buf jmp_buf;
+int __sigsetjmp (sigjmp_buf, int);
+typedef struct {
+  a target;
+} b;
+extern a *target_p;
+b *c;
+extern void foo (a);
+void
+d(void)
+{
+  if (__sigsetjmp(jmp_buf, 1)) {
+    a target = *target_p;
+    c->target = target;
+    foo(target);
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-2.c b/gcc/testsuite/gcc.target/i386/pr95021-2.c
new file mode 100644
index 00000000000..473b5f81664
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-2.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run { target ia32 } } */
+/* { dg-require-effective-target sse2_runtime } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+
+#include <stdlib.h>
+#include <setjmp.h>
+
+jmp_buf buf;
+
+long long *target_p;
+long long *c;
+
+int count;
+
+__attribute__ ((noclone, noinline))
+void
+foo (long long x)
+{
+  if (x != *c)
+    abort ();
+  if (!count)
+    {
+      count++;
+      longjmp (buf, 1);
+    }
+}
+
+__attribute__ ((noclone, noinline))
+void
+bar (void)
+{
+  if (setjmp (buf))
+    {
+      long long target = *target_p;
+      *c = target;
+      foo (target);
+    }
+  else
+    foo (0);
+}
+
+int
+main ()
+{
+  long long val = 30;
+  long long local = 0;
+  target_p = &val;
+  c = &local;
+  bar ();
+  if (val != local)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-3.c b/gcc/testsuite/gcc.target/i386/pr95021-3.c
new file mode 100644
index 00000000000..b213b5db072
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-3.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mregparm=3 -W" } */
+/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */
+
+typedef long long a;
+struct __jmp_buf_tag   {
+};
+typedef struct __jmp_buf_tag sigjmp_buf[1];
+sigjmp_buf jmp_buf;
+int __sigsetjmp (sigjmp_buf, int);
+typedef struct {
+  a target;
+} b;
+extern a *target_p;
+b *c;
+extern void foo (a);
+void
+d(void)
+{
+  if (__sigsetjmp(jmp_buf, 1)) {
+    a target = *target_p;
+    c->target = target;
+    foo(target);
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-4.c b/gcc/testsuite/gcc.target/i386/pr95021-4.c
new file mode 100644
index 00000000000..a080d7b289f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-4.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+\[^\n\]*, %xmm" } } */
+
+typedef __int128 a;
+struct __jmp_buf_tag   {
+};
+typedef struct __jmp_buf_tag sigjmp_buf[1];
+sigjmp_buf jmp_buf;
+int __sigsetjmp (sigjmp_buf, int);
+typedef struct {
+  a target;
+} b;
+extern a *target_p;
+b *c;
+extern void foo (int, int, int, int, int, int, a);
+void
+d(void)
+{
+  if (__sigsetjmp(jmp_buf, 1)) {
+    a target = *target_p;
+    c->target = target;
+    foo(1, 2, 3, 4, 5, 6, target);
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-5.c b/gcc/testsuite/gcc.target/i386/pr95021-5.c
new file mode 100644
index 00000000000..3fcf2c3922e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-5.c
@@ -0,0 +1,59 @@ 
+/* { dg-do run { target int128 } } */
+/* { dg-require-effective-target sse2_runtime } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+
+#include <stdlib.h>
+#include <setjmp.h>
+
+jmp_buf buf;
+
+__int128 *target_p;
+__int128 *c;
+
+int count;
+
+__attribute__ ((noclone, noinline))
+void
+foo (__int128 i1, __int128 i2, __int128 i3, __int128 x)
+{
+  if (i1 != 0xbadbeef1)
+    abort ();
+  if (i2 != 0x2badbeef)
+    abort ();
+  if (i3 != 0xbad3beef)
+    abort ();
+  if (x != *c)
+    abort ();
+  if (!count)
+    {
+      count++;
+      longjmp (buf, 1);
+    }
+}
+
+__attribute__ ((noclone, noinline))
+void
+bar (void)
+{
+  if (setjmp (buf))
+    {
+      __int128 target = *target_p;
+      *c = target;
+      foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, target);
+    }
+  else
+    foo (0xbadbeef1, 0x2badbeef, 0xbad3beef, 0);
+}
+
+int
+main ()
+{
+  __int128 val = 30;
+  __int128 local = 0;
+  target_p = &val;
+  c = &local;
+  bar ();
+  if (val != local)
+    abort ();
+  return 0;
+}