x86: Mark scratch operand in ssse3_pshufbv8qi3 as earlyclobber

Message ID 20200403165112.155085-1-hjl.tools@gmail.com
State New
Headers show
Series
  • x86: Mark scratch operand in ssse3_pshufbv8qi3 as earlyclobber
Related show

Commit Message

luoxhu via Gcc-patches April 3, 2020, 4:51 p.m.
commit 16ed2601ad0a4aa82f11e9df86ea92183f94f979
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed May 15 15:26:19 2019 +0000

    i386: Emulate MMX pshufb with SSE version

has

+(define_insn_and_split "ssse3_pshufbv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")
+  (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv")
+           (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")]
+          UNSPEC_PSHUFB))
+   (clobber (match_scratch:V4SI 3 "=X,x,Yv"))]
                                       ^^^  These must be earlyclobber.
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
+  "@
+   pshufb\t{%2, %0|%0, %2}
+   #
+   #"
+  "TARGET_MMX_WITH_SSE && reload_completed"
+  [(set (match_dup 3) (match_dup 5))
+   (set (match_dup 3)
+  (and:V4SI (match_dup 3) (match_dup 2)))
+   (set (match_dup 0)
+  (unspec:V16QI [(match_dup 1) (match_dup 4)] UNSPEC_PSHUFB))]

If input register operand 2 is dead after this insn, RA may choose its
register as the scratch operand.  Since the scratch operand isn't marked
as earlyclobber, operand 2 becomes unused after the split and then gets
optimized out.  Marking the scratch operand as earlyclobber fixes the issue.

OK for master if there are no regressions?

H.J.
--
gcc/

	PR target/94467
	* config/i386/sse.md (ssse3_pshufbv8qi3): Mark scratch operand
	as earlyclobber.

gcc/

	PR target/94467
	* testsuite/gcc.target/i386/pr94467-1.c: New test.
	* testsuite/gcc.target/i386/pr94467-2.c: Likewise.
---
 gcc/config/i386/sse.md                    |  2 +-
 gcc/testsuite/gcc.target/i386/pr94467-1.c | 40 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr94467-2.c | 48 +++++++++++++++++++++++
 3 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94467-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94467-2.c

-- 
2.25.1

Comments

luoxhu via Gcc-patches April 3, 2020, 4:57 p.m. | #1
On Fri, Apr 3, 2020 at 6:51 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> commit 16ed2601ad0a4aa82f11e9df86ea92183f94f979

> Author: H.J. Lu <hongjiu.lu@intel.com>

> Date:   Wed May 15 15:26:19 2019 +0000

>

>     i386: Emulate MMX pshufb with SSE version

>

> has

>

> +(define_insn_and_split "ssse3_pshufbv8qi3"

> +  [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv")

> +  (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv")

> +           (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")]

> +          UNSPEC_PSHUFB))

> +   (clobber (match_scratch:V4SI 3 "=X,x,Yv"))]

>                                        ^^^  There are earlyclobber.

> +  "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"

> +  "@

> +   pshufb\t{%2, %0|%0, %2}

> +   #

> +   #"

> +  "TARGET_MMX_WITH_SSE && reload_completed"

> +  [(set (match_dup 3) (match_dup 5))

> +   (set (match_dup 3)

> +  (and:V4SI (match_dup 3) (match_dup 2)))

> +   (set (match_dup 0)

> +  (unspec:V16QI [(match_dup 1) (match_dup 4)] UNSPEC_PSHUFB))]

>

> If input register operand 2 is dead after this insn, RA may choose it

> as scratch operand.  Since it isn't marked as earlyclobber, operand 2

> becomes unused after split and then it gets optimized out.  Mark scratch

> operand as earlyclobber fixes the issue.

>

> OK for master if there are no regressions?

>

> H.J.

> --

> gcc/

>

>         PR target/94467

>         * config/i386/sse.md (ssse3_pshufbv8qi3): Mark scratch operand

>         as earlyclobber.

>

> gcc/

>

>         PR target/94467

>         * testsuite/gcc.target/i386/pr94467-1.c: New test.

>         * testsuite/gcc.target/i386/pr94467-2.c: Likewise.


OK.

Thanks,
Uros.

> ---

>  gcc/config/i386/sse.md                    |  2 +-

>  gcc/testsuite/gcc.target/i386/pr94467-1.c | 40 +++++++++++++++++++

>  gcc/testsuite/gcc.target/i386/pr94467-2.c | 48 +++++++++++++++++++++++

>  3 files changed, 89 insertions(+), 1 deletion(-)

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94467-1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94467-2.c

>

> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

> index fba91b7369a..1de03a515d9 100644

> --- a/gcc/config/i386/sse.md

> +++ b/gcc/config/i386/sse.md

> @@ -16695,7 +16695,7 @@ (define_insn_and_split "ssse3_pshufbv8qi3"

>         (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv")

>                       (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")]

>                      UNSPEC_PSHUFB))

> -   (clobber (match_scratch:V4SI 3 "=X,x,Yv"))]

> +   (clobber (match_scratch:V4SI 3 "=X,&x,&Yv"))]

>    "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"

>    "@

>     pshufb\t{%2, %0|%0, %2}

> diff --git a/gcc/testsuite/gcc.target/i386/pr94467-1.c b/gcc/testsuite/gcc.target/i386/pr94467-1.c

> new file mode 100644

> index 00000000000..a51c3a8f5fe

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr94467-1.c

> @@ -0,0 +1,40 @@

> +/* { dg-do run } */

> +/* { dg-require-effective-target avx } */

> +/* { dg-options "-O -mavx" } */

> +

> +#include "avx-check.h"

> +

> +typedef char __attribute__ ((__vector_size__ (8))) v8qi;

> +typedef short __attribute__ ((__vector_size__ (8))) v4hi;

> +typedef int __attribute__ ((__vector_size__ (8))) v2si;

> +typedef long long __attribute__ ((__vector_size__ (8))) v1di;

> +typedef unsigned long long u64;

> +u64 k, c;

> +

> +v8qi g, h, p, q;

> +v4hi d, e, f, l, n, o;

> +v2si j;

> +

> +u64

> +foo (v4hi r)

> +{

> +  v8qi s;

> +  f = (v4hi) j;

> +  e = __builtin_ia32_psrlwi ((v4hi) k, c);

> +  s = __builtin_ia32_pavgb (h, h);

> +  n = __builtin_ia32_pabsw (f);

> +  o = __builtin_ia32_psubusw (n, l);

> +  p = __builtin_ia32_packsswb (r, o);

> +  q = __builtin_ia32_pshufb (p, s);

> +  g = __builtin_ia32_punpcklbw (q, (v8qi) r);

> +  d = r;

> +  return (u64) g + (u64) h + (u64) j;

> +}

> +

> +static void

> +avx_test (void)

> +{

> +  u64 x = foo ((v4hi) { 5 });

> +  if (x != 0x0005000500050505)

> +    __builtin_abort ();

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/pr94467-2.c b/gcc/testsuite/gcc.target/i386/pr94467-2.c

> new file mode 100644

> index 00000000000..8128be325e4

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr94467-2.c

> @@ -0,0 +1,48 @@

> +/* { dg-do run } */

> +/* { dg-require-effective-target ssse3 } */

> +/* { dg-options "-O -mssse3" } */

> +

> +#ifndef CHECK_H

> +#define CHECK_H "ssse3-check.h"

> +#endif

> +

> +#ifndef TEST

> +#define TEST ssse3_test

> +#endif

> +

> +#include CHECK_H

> +

> +typedef char __attribute__ ((__vector_size__ (8))) v8qi;

> +typedef short __attribute__ ((__vector_size__ (8))) v4hi;

> +typedef int __attribute__ ((__vector_size__ (8))) v2si;

> +typedef long long __attribute__ ((__vector_size__ (8))) v1di;

> +typedef unsigned long long u64;

> +u64 k, c;

> +

> +v8qi g, h, p, q;

> +v4hi d, e, f, l, n, o;

> +v2si j;

> +

> +u64

> +foo (v4hi r)

> +{

> +  v8qi s;

> +  f = (v4hi) j;

> +  e = __builtin_ia32_psrlwi ((v4hi) k, c);

> +  s = __builtin_ia32_pavgb (h, h);

> +  n = __builtin_ia32_pabsw (f);

> +  o = __builtin_ia32_psubusw (n, l);

> +  p = __builtin_ia32_packsswb (r, o);

> +  q = __builtin_ia32_pshufb (p, s);

> +  g = __builtin_ia32_punpcklbw (q, (v8qi) r);

> +  d = r;

> +  return (u64) g + (u64) h + (u64) j;

> +}

> +

> +static void

> +ssse3_test (void)

> +{

> +  u64 x = foo ((v4hi) { 5 });

> +  if (x != 0x0005000500050505)

> +    __builtin_abort ();

> +}

> --

> 2.25.1

>

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index fba91b7369a..1de03a515d9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16695,7 +16695,7 @@  (define_insn_and_split "ssse3_pshufbv8qi3"
 	(unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv")
 		      (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")]
 		     UNSPEC_PSHUFB))
-   (clobber (match_scratch:V4SI 3 "=X,x,Yv"))]
+   (clobber (match_scratch:V4SI 3 "=X,&x,&Yv"))]
   "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3"
   "@
    pshufb\t{%2, %0|%0, %2}
diff --git a/gcc/testsuite/gcc.target/i386/pr94467-1.c b/gcc/testsuite/gcc.target/i386/pr94467-1.c
new file mode 100644
index 00000000000..a51c3a8f5fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94467-1.c
@@ -0,0 +1,40 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O -mavx" } */
+
+#include "avx-check.h"
+
+typedef char __attribute__ ((__vector_size__ (8))) v8qi;
+typedef short __attribute__ ((__vector_size__ (8))) v4hi;
+typedef int __attribute__ ((__vector_size__ (8))) v2si;
+typedef long long __attribute__ ((__vector_size__ (8))) v1di;
+typedef unsigned long long u64;
+u64 k, c;
+
+v8qi g, h, p, q;
+v4hi d, e, f, l, n, o;
+v2si j;
+
+u64
+foo (v4hi r)
+{
+  v8qi s;
+  f = (v4hi) j;
+  e = __builtin_ia32_psrlwi ((v4hi) k, c);
+  s = __builtin_ia32_pavgb (h, h);
+  n = __builtin_ia32_pabsw (f);
+  o = __builtin_ia32_psubusw (n, l);
+  p = __builtin_ia32_packsswb (r, o);
+  q = __builtin_ia32_pshufb (p, s);
+  g = __builtin_ia32_punpcklbw (q, (v8qi) r);
+  d = r;
+  return (u64) g + (u64) h + (u64) j;
+}
+
+static void
+avx_test (void)
+{
+  u64 x = foo ((v4hi) { 5 });
+  if (x != 0x0005000500050505)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr94467-2.c b/gcc/testsuite/gcc.target/i386/pr94467-2.c
new file mode 100644
index 00000000000..8128be325e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94467-2.c
@@ -0,0 +1,48 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O -mssse3" } */
+
+#ifndef CHECK_H
+#define CHECK_H "ssse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST ssse3_test
+#endif
+
+#include CHECK_H
+
+typedef char __attribute__ ((__vector_size__ (8))) v8qi;
+typedef short __attribute__ ((__vector_size__ (8))) v4hi;
+typedef int __attribute__ ((__vector_size__ (8))) v2si;
+typedef long long __attribute__ ((__vector_size__ (8))) v1di;
+typedef unsigned long long u64;
+u64 k, c;
+
+v8qi g, h, p, q;
+v4hi d, e, f, l, n, o;
+v2si j;
+
+u64
+foo (v4hi r)
+{
+  v8qi s;
+  f = (v4hi) j;
+  e = __builtin_ia32_psrlwi ((v4hi) k, c);
+  s = __builtin_ia32_pavgb (h, h);
+  n = __builtin_ia32_pabsw (f);
+  o = __builtin_ia32_psubusw (n, l);
+  p = __builtin_ia32_packsswb (r, o);
+  q = __builtin_ia32_pshufb (p, s);
+  g = __builtin_ia32_punpcklbw (q, (v8qi) r);
+  d = r;
+  return (u64) g + (u64) h + (u64) j;
+}
+
+static void
+ssse3_test (void)
+{
+  u64 x = foo ((v4hi) { 5 });
+  if (x != 0x0005000500050505)
+    __builtin_abort ();
+}