x86: Allow vector register pushes

Message ID CAMe9rOpC3DMQavQY_0Myd7VHmXv9ECerr6qHc+si-7wgZBfykg@mail.gmail.com
State New
Headers show
Series
  • x86: Allow vector register pushes
Related show

Commit Message

Richard Biener via Gcc-patches May 13, 2020, 3:58 p.m.
On Wed, May 13, 2020 at 6:35 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Wed, May 13, 2020 at 3:25 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > On Wed, May 13, 2020 at 6:17 AM Uros Bizjak <ubizjak@gmail.com> wrote:

> > >

> > > On Wed, May 13, 2020 at 2:37 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > >

> > > > On Wed, May 13, 2020 at 5:04 AM Uros Bizjak <ubizjak@gmail.com> wrote:

> > > > >

> > > > > On Wed, May 13, 2020 at 1:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:

> > > > > >

> > > > > > On Tue, May 12, 2020 at 10:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > > > >

> > > > > > > Update STV pass to properly count cost of XMM register push.  In 32-bit

> > > > > > > mode, to convert XMM register push in DImode, we do an XMM store in

> > > > > > > DImode, followed by 2 memory pushes in SImode, instead of 2 integer

> > > > > > > register pushes in SImode.  To convert XM register push in SImode, we

> > > > > > > do an XMM register to integer register move in SImode, followed an

> > > > > > > integer register push in SImode, instead of an integer register push in

> > > > > > > SImode.  In 64-bit mode, we do an XMM register to integer register move

> > > > > > > in SImode or DImode, followed an integer register push in SImode or

> > > > > > > DImode, instead of an integer register push SImode or DImode.

> > > > > > >

> > > > > > > Tested on Linux/x86 and Linux/x86-64.

> > > > > >

> > > > > > I think it is better to implement XMM register pushes, and split them

> > > > > > after reload to a sequence of:

> > > > > >

> > > > > > (set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))

> > > > > > (set (match_dup 0) (match_dup 1))

> > > > > >

> > > > > > This is definitely better than trips through memory to stack.

> > > > >

> > > > > Attached (untested patch) allows fake pushes from XMM registers, so

> > > > > STV pass can allow pushes.

> > > >

> > > > The problem isn't STV pass.  The IRA pass won't assign hard register for

> > >

> > > Please see the sequence before STV pass:

> > >

> > > (insn 24 23 25 3 (set (reg/v:DI 85 [ target ])

> > >         (mem:DI (reg/f:SI 86 [ target_p ]) [2 *target_p.0_1+0 S8

> > > A32])) "pr95021.c":17:7 64 {*movdi_internal}

> > >      (expr_list:REG_DEAD (reg/f:SI 86 [ target_p ])

> > >         (nil)))

> > >

> > > (insn 26 25 27 3 (set (mem:DI (reg/f:SI 87 [ c ]) [2 c.1_2->target+0 S8 A32])

> > >         (reg/v:DI 85 [ target ])) "pr95021.c":18:15 64 {*movdi_internal}

> > >      (expr_list:REG_DEAD (reg/f:SI 87 [ c ])

> > >         (nil)))

> > >

> > > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

> > >         (reg/v:DI 85 [ target ])) "pr95021.c":19:5 40 {*pushdi2}

> > >      (expr_list:REG_DEAD (reg/v:DI 85 [ target ])

> > >         (expr_list:REG_ARGS_SIZE (const_int 16 [0x10])

> > >             (nil))))

> > >

> > > So, (reg 85) gets moved to a memory in (insn 26) and pushed to a stack

> > > in (insn 28).

> > >

> > > STV pass does this:

> > >

> > > (insn 24 41 37 3 (set (subreg:V2DI (reg:DI 89) 0)

> > >         (subreg:V2DI (reg:DI 91) 0)) "pr95021.c":17:7 1342 {movv2di_internal}

> > >      (expr_list:REG_DEAD (reg/f:SI 86 [ target_p ])

> > >         (nil)))

> > >

> > > (insn 37 24 38 3 (set (reg:V2DI 90)

> > >         (subreg:V2DI (reg:DI 89) 0)) "pr95021.c":17:7 -1

> > >      (nil))

> > > (insn 38 37 39 3 (set (subreg:SI (reg/v:DI 85 [ target ]) 0)

> > >         (subreg:SI (reg:V2DI 90) 0)) "pr95021.c":17:7 -1

> > >      (nil))

> > > (insn 39 38 40 3 (set (reg:V2DI 90)

> > >         (lshiftrt:V2DI (reg:V2DI 90)

> > >             (const_int 32 [0x20]))) "pr95021.c":17:7 -1

> > >      (nil))

> > > (insn 40 39 25 3 (set (subreg:SI (reg/v:DI 85 [ target ]) 4)

> > >         (subreg:SI (reg:V2DI 90) 0)) "pr95021.c":17:7 -1

> > >      (nil))

> > >

> > > (insn 26 25 27 3 (set (mem:DI (reg/f:SI 87 [ c ]) [2 c.1_2->target+0 S8 A32])

> > >         (reg:DI 89)) "pr95021.c":18:15 64 {*movdi_internal}

> > >      (expr_list:REG_DEAD (reg/f:SI 87 [ c ])

> > >         (nil)))

> > >

> > > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

> > >         (reg/v:DI 85 [ target ])) "pr95021.c":19:5 40 {*pushdi2}

> > >      (expr_list:REG_DEAD (reg/v:DI 85 [ target ])

> > >         (expr_list:REG_ARGS_SIZE (const_int 16 [0x10])

> > >             (nil))))

> > >

> > > For some reason (reg 89) moves to (reg 85) via sequence (insn 37) to

> > > (insn 40), splitting to SImode and reassembling back to (reg 85).

> > >

> > > >

> > > > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

> > > >         (reg/v:DI 85 [ target ])) "x.i":19:5 40 {*pushdi2}

> > > >      (expr_list:REG_DEAD (reg/v:DI 85 [ target ])

> > > >         (expr_list:REG_ARGS_SIZE (const_int 16 [0x10])

> > > >             (nil))))

> > > >

> > > > and the reload pass turns into

> > > >

> > > > ....

> > > > (insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])

> > > >         (mem/c:DI (plus:SI (reg/f:SI 7 sp)

> > > >                 (const_int 16 [0x10])) [8 %sfp+-8 S8 A64])) "x.i":19:5

> > > > 40 {*pushdi2}

> > > >      (expr_list:REG_ARGS_SIZE (const_int 16 [0x10])

> > > >         (nil)))

> > >

> > > This is an optimization of the reload pass, it figures out where the

> > > values live and tries its best to assign hard regs and stack slots to

> > > the above convoluted sequence. Note, that DImode push from integer

> > > regs splits to SImode pushes after reload. This has nothing with STV

> > > pass.

> > >

> > > The question is, why STV pass creates its funny sequence? The original

> > > sequence should be easily solved by storing DImode from XMM register

> > > and (with patched gcc) pushing DImode value from the same XMM

> > > register.

> >

> > STV pass is mostly OK since it has to use XMM to move upper and lower

> > 32 bits of a 64-bit integer.  The problem is that push XMM becomes very

> > expensive later.

>

> As shown in the patch, XMM push should be just decrement of SP reg and

> move to the created stack slot.


OK for master if there are no regressions?

Thanks.

-- 
H.J.

Comments

Richard Biener via Gcc-patches May 15, 2020, 9:21 a.m. | #1
On Wed, May 13, 2020 at 5:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > The question is, why STV pass creates its funny sequence? The original

> > > > sequence should be easily solved by storing DImode from XMM register

> > > > and (with patched gcc) pushing DImode value from the same XMM

> > > > register.

> > >

> > > STV pass is mostly OK since it has to use XMM to move upper and lower

> > > 32 bits of a 64-bit integer.  The problem is that push XMM becomes very

> > > expensive later.

> >

> > As shown in the patch, XMM push should be just decrement of SP reg and

> > move to the created stack slot.

>

> OK for master if there are no regressions?


diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 78fb373db6e..6cad125fa83 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1264,7 +1264,8 @@ has_non_address_hard_reg (rtx_insn *insn)
   FOR_EACH_INSN_DEF (ref, insn)
     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
     && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
-    && DF_REF_REGNO (ref) != FLAGS_REG)
+    && DF_REF_REGNO (ref) != FLAGS_REG
+    && DF_REF_REGNO (ref) != SP_REG)
       return true;

I don't think this part is correct. The function comment says:

/* Return 1 if INSN uses or defines a hard register.
   Hard register uses in a memory address are ignored.
   Clobbers and flags definitions are ignored.  */

and you are putting SP_REG into clobber part.

OTOH, SP_REG in:

(insn 28 27 29 3 (set (mem:DI (pre_dec:SI (reg/f:SI 7 sp)) [2  S8 A64])
        (reg/v:DI 85 [ target ])) "x.i":19:5 40 {*pushdi2}

is inside memory, so REG_SP should already be ignored. Please
investigate, why it is not the case.

+(define_insn "*pushv1ti2"
+  [(set (match_operand:V1TI 0 "push_operand" "=<")
+    (match_operand:V1TI 1 "general_no_elim_operand" "v"))]
+  ""
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "TI")])
+
+(define_split
+  [(set (match_operand:V1TI 0 "push_operand" "")
+    (match_operand:V1TI 1 "sse_reg_operand" ""))]
+  "TARGET_64BIT && reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+    (set (match_dup 0) (match_dup 1))]
+{
+  operands[2] = GEN_INT (-PUSH_ROUNDING (16));
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})

The above part is wrong on many levels, e.g. using wrong predicate,
unnecessary rounding, it should be defined as define_insn_and_split,
conditionalized on TARGET_64BIT && TARGET_STV and put nearby existing
pushti patterns.

I will implement push changes (modulo V1T1mode) by myself, since they
are independent of STV stuff.

Uros.

Patch

From 08d2f763bf16e59e4788212298a47366b8179474 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 12 May 2020 11:30:29 -0700
Subject: [PATCH] x86: Allow vector register pushes

Add vector register pushes and split them after reload to a sequence of:

(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))

so that STV pass can convert DI/TI mode integer pushes to vector register
pushes.  Also update has_non_address_hard_reg to allow pushes.

gcc/

2020-05-XX  Uros Bizjak    <ubizjak@gmail.com>
	    H.J. Lu  <hongjiu.lu@intel.com>

	PR target/95021
	* config/i386/i386-features.c (has_non_address_hard_reg): Allow
	SP_REG.
	* config/i386/i386.md (SWIDWI): New mode iterator.
	(*push<mode>2): Allow XMM register.
	(*pushdi2_rex64): Likewise.
	(*pushsi2): Likewise.
	(*push<mode>2_rex64): Likewise.
	Add XMM register push splitters.

gcc/testsuite/

2020-05-XX  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/95021
	* gcc.target/i386/pr95021-1.c: New test.
	* gcc.target/i386/pr95021-2.c: Likewise.
	* gcc.target/i386/pr95021-3.c: Likewise.
---
 gcc/config/i386/i386-features.c           |  3 +-
 gcc/config/i386/i386.md                   | 66 ++++++++++++++++++-----
 gcc/testsuite/gcc.target/i386/pr95021-1.c | 25 +++++++++
 gcc/testsuite/gcc.target/i386/pr95021-2.c | 25 +++++++++
 gcc/testsuite/gcc.target/i386/pr95021-3.c | 25 +++++++++
 5 files changed, 130 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95021-3.c

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 78fb373db6e..6cad125fa83 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1264,7 +1264,8 @@  has_non_address_hard_reg (rtx_insn *insn)
   FOR_EACH_INSN_DEF (ref, insn)
     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
 	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
-	&& DF_REF_REGNO (ref) != FLAGS_REG)
+	&& DF_REF_REGNO (ref) != FLAGS_REG
+	&& DF_REF_REGNO (ref) != SP_REG)
       return true;
 
   FOR_EACH_INSN_USE (ref, insn)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 722eb9b5ec8..226af7b1773 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1049,6 +1049,9 @@  (define_mode_iterator DWI [(DI "!TARGET_64BIT")
 ;; SWI and DWI together.
 (define_mode_iterator SWIDWI [QI HI SI DI (TI "TARGET_64BIT")])
 
+;; SWI48 and DWI together.
+(define_mode_iterator SWI48DWI [SI DI (TI "TARGET_64BIT")])
+
 ;; GET_MODE_SIZE for selected modes.  As GET_MODE_SIZE is not
 ;; compile time constant, it is faster to use <MODE_SIZE> than
 ;; GET_MODE_SIZE (<MODE>mode).  For XFmode which depends on
@@ -1670,8 +1673,8 @@  (define_insn "*cmpi<unord><MODEF:mode>"
 ;; Push/pop instructions.
 
 (define_insn "*push<mode>2"
-  [(set (match_operand:DWI 0 "push_operand" "=<")
-	(match_operand:DWI 1 "general_no_elim_operand" "riF*o"))]
+  [(set (match_operand:DWI 0 "push_operand" "=<,<")
+	(match_operand:DWI 1 "general_no_elim_operand" "riF*o,v"))]
   ""
   "#"
   [(set_attr "type" "multi")
@@ -1685,13 +1688,14 @@  (define_split
   "ix86_split_long_move (operands); DONE;")
 
 (define_insn "*pushdi2_rex64"
-  [(set (match_operand:DI 0 "push_operand" "=<,!<")
-	(match_operand:DI 1 "general_no_elim_operand" "re*m,n"))]
+  [(set (match_operand:DI 0 "push_operand" "=<,<,!<")
+	(match_operand:DI 1 "general_no_elim_operand" "re*m,v,n"))]
   "TARGET_64BIT"
   "@
    push{q}\t%1
+   #
    #"
-  [(set_attr "type" "push,multi")
+  [(set_attr "type" "push,multi,multi")
    (set_attr "mode" "DI")])
 
 ;; Convert impossible pushes of immediate to existing instructions.
@@ -1726,11 +1730,13 @@  (define_split
 })
 
 (define_insn "*pushsi2"
-  [(set (match_operand:SI 0 "push_operand" "=<")
-	(match_operand:SI 1 "general_no_elim_operand" "ri*m"))]
+  [(set (match_operand:SI 0 "push_operand" "=<,<")
+	(match_operand:SI 1 "general_no_elim_operand" "ri*m,v"))]
   "!TARGET_64BIT"
-  "push{l}\t%1"
-  [(set_attr "type" "push")
+  "@
+   push{l}\t%1
+   #"
+  [(set_attr "type" "push,multi")
    (set_attr "mode" "SI")])
 
 ;; emit_push_insn when it calls move_by_pieces requires an insn to
@@ -1739,11 +1745,13 @@  (define_insn "*pushsi2"
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
 (define_insn "*push<mode>2_rex64"
-  [(set (match_operand:SWI124 0 "push_operand" "=X")
-	(match_operand:SWI124 1 "nonmemory_no_elim_operand" "r<i>"))]
+  [(set (match_operand:SWI124 0 "push_operand" "=X,X")
+	(match_operand:SWI124 1 "nonmemory_no_elim_operand" "r<i>,v"))]
   "TARGET_64BIT"
-  "push{q}\t%q1"
-  [(set_attr "type" "push")
+  "@
+   push{q}\t%q1
+   #"
+  [(set_attr "type" "push,multi")
    (set_attr "mode" "DI")])
 
 (define_insn "*push<mode>2"
@@ -1754,6 +1762,38 @@  (define_insn "*push<mode>2"
   [(set_attr "type" "push")
    (set_attr "mode" "SI")])
 
+(define_split
+  [(set (match_operand:SWI48DWI 0 "push_operand")
+	(match_operand:SWI48DWI 1 "sse_reg_operand"))]
+  "TARGET_SSE && reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+    (set (match_dup 0) (match_dup 1))]
+{
+  operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (<SWI48DWI:MODE>mode)));
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
+(define_insn "*pushv1ti2"
+  [(set (match_operand:V1TI 0 "push_operand" "=<")
+	(match_operand:V1TI 1 "general_no_elim_operand" "v"))]
+  ""
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "TI")])
+
+(define_split
+  [(set (match_operand:V1TI 0 "push_operand" "")
+	(match_operand:V1TI 1 "sse_reg_operand" ""))]
+  "TARGET_64BIT && reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+    (set (match_dup 0) (match_dup 1))]
+{
+  operands[2] = GEN_INT (-PUSH_ROUNDING (16));
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
 (define_insn "*push<mode>2_prologue"
   [(set (match_operand:W 0 "push_operand" "=<")
 	(match_operand:W 1 "general_no_elim_operand" "r<i>*m"))
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-1.c b/gcc/testsuite/gcc.target/i386/pr95021-1.c
new file mode 100644
index 00000000000..218776240ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-1.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mregparm=0 -W" } */
+/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */
+
+typedef long long a;
+struct __jmp_buf_tag   {
+};
+typedef struct __jmp_buf_tag sigjmp_buf[1];
+sigjmp_buf jmp_buf;
+int __sigsetjmp (sigjmp_buf, int);
+typedef struct {
+  a target;
+} b;
+extern a *target_p;
+b *c;
+extern void foo (a);
+void
+d(void)
+{
+  if (__sigsetjmp(jmp_buf, 1)) {
+    a target = *target_p;
+    c->target = target;
+    foo(target);
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-2.c b/gcc/testsuite/gcc.target/i386/pr95021-2.c
new file mode 100644
index 00000000000..b213b5db072
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-2.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mregparm=3 -W" } */
+/* { dg-final { scan-assembler "movq\[ \t\]+\[^\n\]*, %xmm" } } */
+
+typedef long long a;
+struct __jmp_buf_tag   {
+};
+typedef struct __jmp_buf_tag sigjmp_buf[1];
+sigjmp_buf jmp_buf;
+int __sigsetjmp (sigjmp_buf, int);
+typedef struct {
+  a target;
+} b;
+extern a *target_p;
+b *c;
+extern void foo (a);
+void
+d(void)
+{
+  if (__sigsetjmp(jmp_buf, 1)) {
+    a target = *target_p;
+    c->target = target;
+    foo(target);
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95021-3.c b/gcc/testsuite/gcc.target/i386/pr95021-3.c
new file mode 100644
index 00000000000..a080d7b289f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95021-3.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mstv -W" } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+\[^\n\]*, %xmm" } } */
+
+typedef __int128 a;
+struct __jmp_buf_tag   {
+};
+typedef struct __jmp_buf_tag sigjmp_buf[1];
+sigjmp_buf jmp_buf;
+int __sigsetjmp (sigjmp_buf, int);
+typedef struct {
+  a target;
+} b;
+extern a *target_p;
+b *c;
+extern void foo (int, int, int, int, int, int, a);
+void
+d(void)
+{
+  if (__sigsetjmp(jmp_buf, 1)) {
+    a target = *target_p;
+    c->target = target;
+    foo(1, 2, 3, 4, 5, 6, target);
+  }
+}
-- 
2.26.2