[committed] i386: Use lock prefixed insn instead of MFENCE [PR95750]

Message ID CAFULd4Zk5LEtVFeq-kfG49HU3K2Q6sAfEAs6-=Go-h92NjWOwA@mail.gmail.com
State New
Headers show
Series
  • [committed] i386: Use lock prefixed insn instead of MFENCE [PR95750]
Related show

Commit Message

Uros Bizjak via Gcc-patches July 20, 2020, 6:39 p.m.
Currently, __atomic_thread_fence(seq_cst) on x86 and x86-64 generates
an mfence instruction. A dummy atomic instruction (a lock-prefixed instruction
or xchg with a memory operand) would provide the same sequential consistency
guarantees while being more efficient on most current CPUs. The mfence
instruction additionally orders non-temporal stores, which are not relevant
for atomic operations and are not ordered by seq_cst atomic operations anyway.

2020-07-20  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:
    PR target/95750
    * config/i386/i386.h (TARGET_AVOID_MFENCE):
    Rename from TARGET_USE_XCHG_FOR_ATOMIC_STORE.
    * config/i386/sync.md (mfence_sse2): Disable for TARGET_AVOID_MFENCE.
    (mfence_nosse): Enable also for TARGET_AVOID_MFENCE. Emit stack
    referred memory in word_mode.
    (mem_thread_fence): Do not generate mfence_sse2 pattern when
    TARGET_AVOID_MFENCE is true.
    (atomic_store<mode>): Update for rename.
    * config/i386/x86-tune.def (X86_TUNE_AVOID_MFENCE):
    Rename from X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE.

gcc/testsuite/ChangeLog:
    PR target/95750
    * gcc.target/i386/pr95750.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.

Comments

Franz Sirl July 21, 2020, 3:46 p.m. | #1
Am 2020-07-20 um 20:39 schrieb Uros Bizjak via Gcc-patches:
> Currently, __atomic_thread_fence(seq_cst) on x86 and x86-64 generates

> mfence instruction. A dummy atomic instruction (a lock-prefixed instruction

> or xchg with a memory operand) would provide the same sequential consistency

> guarantees while being more efficient on most current CPUs. The mfence

> instruction additionally orders non-temporal stores, which is not relevant

> for atomic operations and are not ordered by seq_cst atomic operations anyway.

> 

> 2020-07-20  Uroš Bizjak  <ubizjak@gmail.com>

> 

> gcc/ChangeLog:

>      PR target/95750

>      * config/i386/i386.h (TARGET_AVOID_MFENCE):

>      Rename from TARGET_USE_XCHG_FOR_ATOMIC_STORE.

>      * config/i386/sync.md (mfence_sse2): Disable for TARGET_AVOID_MFENCE.

>      (mfence_nosse): Enable also for TARGET_AVOID_MFENCE. Emit stack

>      referred memory in word_mode.

>      (mem_thread_fence): Do not generate mfence_sse2 pattern when

>      TARGET_AVOID_MFENCE is true.

>      (atomic_store<mode>): Update for rename.

>      * config/i386/x86-tune.def (X86_TUNE_AVOID_MFENCE):

>      Rename from X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE.

> 

> gcc/testsuite/ChangeLog:

>      PR target/95750

>      * gcc.target/i386/pr95750.c: New test.

> 

> Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

> 

> Uros.

> 


Hi,

I didn't bisect it, but I see a profiledbootstrap ICE that may be related:

libtool: compile:
/home/fsirl/rpmbuild/BUILD/gcc-11.0.0+gitr11+2246/obj-x86_64-suse-linux/./gcc/xgcc
-B/home/fsirl/rpmbuild/BUILD/gcc-11.0.0+gitr11+2246/obj-x86_64-suse-linux/./gcc/
-B/usr/x86_64-suse-linux/bin/ -B/usr/x86_64-suse-linux/lib/ -isystem
/usr/x86_64-suse-linux/include -isystem
/usr/x86_64-suse-linux/sys-include -m32 -DHAVE_CONFIG_H -I.
-I../../../../libgo -I ../../../../libgo/runtime
-I../../../../libgo/../libffi/include -I../libffi/include -pthread
-L../libatomic/.libs -fexceptions -fnon-call-exceptions -fsplit-stack
-Wall -Wextra -Wwrite-strings -Wcast-qual -minline-all-stringops
-D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -I
../../../../libgo/../libgcc -I ../../../../libgo/../libbacktrace -I
../../../gcc/include -O2 -g -fmessage-length=0 -D_FORTIFY_SOURCE=2
-funwind-tables -fasynchronous-unwind-tables -U_FORTIFY_SOURCE -m32
-MT runtime/runtime_c.lo -MD -MP -MF runtime/.deps/runtime_c.Tpo -c
../../../../libgo/runtime/runtime_c.c  -fPIC -DPIC -o runtime/.libs/runtime_c.o
../../../../libgo/runtime/runtime_c.c: In function ‘runtime_cputicks’:
../../../../libgo/runtime/runtime_c.c:102:1: error: unrecognizable insn:
   102 | }
       | ^
(insn 20 19 21 6 (set (mem/v:BLK (scratch:SI) [0  A8])
         (unspec:BLK [
                 (mem/v:BLK (scratch:SI) [0  A8])
             ] UNSPEC_MFENCE)) "../../../../libgo/runtime/runtime_c.c":84:7 -1
      (nil))
during RTL pass: vregs
../../../../libgo/runtime/runtime_c.c:102:1: internal compiler error: in extract_insn, at recog.c:2294

This is on a Xeon X5650 machine.

Franz
Uros Bizjak via Gcc-patches July 21, 2020, 4:13 p.m. | #2
On Tue, Jul 21, 2020 at 5:46 PM Franz Sirl
<Franz.Sirl-kernel@lauterbach.com> wrote:
>

> Am 2020-07-20 um 20:39 schrieb Uros Bizjak via Gcc-patches:

> > Currently, __atomic_thread_fence(seq_cst) on x86 and x86-64 generates

> > mfence instruction. A dummy atomic instruction (a lock-prefixed instruction

> > or xchg with a memory operand) would provide the same sequential consistency

> > guarantees while being more efficient on most current CPUs. The mfence

> > instruction additionally orders non-temporal stores, which is not relevant

> > for atomic operations and are not ordered by seq_cst atomic operations anyway.

> >

> > 2020-07-20  Uroš Bizjak  <ubizjak@gmail.com>

> >

> > gcc/ChangeLog:

> >      PR target/95750

> >      * config/i386/i386.h (TARGET_AVOID_MFENCE):

> >      Rename from TARGET_USE_XCHG_FOR_ATOMIC_STORE.

> >      * config/i386/sync.md (mfence_sse2): Disable for TARGET_AVOID_MFENCE.

> >      (mfence_nosse): Enable also for TARGET_AVOID_MFENCE. Emit stack

> >      referred memory in word_mode.

> >      (mem_thread_fence): Do not generate mfence_sse2 pattern when

> >      TARGET_AVOID_MFENCE is true.

> >      (atomic_store<mode>): Update for rename.

> >      * config/i386/x86-tune.def (X86_TUNE_AVOID_MFENCE):

> >      Rename from X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE.

> >

> > gcc/testsuite/ChangeLog:

> >      PR target/95750

> >      * gcc.target/i386/pr95750.c: New test.

> >

> > Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

> >

> > Uros.

> >

>

> Hi,

>

> I didn't bisect it, but I see a profiledbootstrap ICE that may be related:


Ah, mfence_sse2 can be expanded from the __builtin_ia32_mfence
independently of tuning flags. I'm testing the following patch:

--cut here--
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index c6827037abf..c88750d3664 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -89,8 +89,7 @@
 (define_insn "mfence_sse2"
   [(set (match_operand:BLK 0)
        (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]
-  "(TARGET_64BIT || TARGET_SSE2)
-   && !TARGET_AVOID_MFENCE"
+  "TARGET_64BIT || TARGET_SSE2"
   "mfence"
   [(set_attr "type" "sse")
    (set_attr "length_address" "0")
@@ -101,8 +100,7 @@
   [(set (match_operand:BLK 0)
        (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))
    (clobber (reg:CC FLAGS_REG))]
-  "!(TARGET_64BIT || TARGET_SSE2)
-   || TARGET_AVOID_MFENCE"
+  ""
 {
   rtx mem = gen_rtx_MEM (word_mode, stack_pointer_rtx);

--cut here--

Uros.
Uros Bizjak via Gcc-patches July 21, 2020, 6:25 p.m. | #3
On Tue, Jul 21, 2020 at 6:13 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Tue, Jul 21, 2020 at 5:46 PM Franz Sirl

> <Franz.Sirl-kernel@lauterbach.com> wrote:


> > I didn't bisect it, but I see a profiledbootstrap ICE that may be related:

>

> Ah, mfence_sse2 can be expanded from the __builtin_ia32_mfence

> independently of tuning flags. I'm testing the following patch:


Bootstrapped, regression tested on x86_64-linux-gnu {,-m32} for all
default languages + go, and committed with the following ChangeLog:

i386: Fix insn conditions of mfence patterns [PR95750]

2020-07-21  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:
    PR target/95750
    * config/i386/sync.md (mfence_sse2): Enable for
    TARGET_64BIT and TARGET_SSE2.
    (mfence_nosse): Always enable.

Uros.

> --cut here--

> diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md

> index c6827037abf..c88750d3664 100644

> --- a/gcc/config/i386/sync.md

> +++ b/gcc/config/i386/sync.md

> @@ -89,8 +89,7 @@

>  (define_insn "mfence_sse2"

>    [(set (match_operand:BLK 0)

>         (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]

> -  "(TARGET_64BIT || TARGET_SSE2)

> -   && !TARGET_AVOID_MFENCE"

> +  "TARGET_64BIT || TARGET_SSE2"

>    "mfence"

>    [(set_attr "type" "sse")

>     (set_attr "length_address" "0")

> @@ -101,8 +100,7 @@

>    [(set (match_operand:BLK 0)

>         (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))

>     (clobber (reg:CC FLAGS_REG))]

> -  "!(TARGET_64BIT || TARGET_SSE2)

> -   || TARGET_AVOID_MFENCE"

> +  ""

>  {

>    rtx mem = gen_rtx_MEM (word_mode, stack_pointer_rtx);

>

> --cut here--

>

> Uros.

Patch

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f4a8f1391fa..114967e49a3 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -598,8 +598,7 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
 #define TARGET_ONE_IF_CONV_INSN \
 	ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN]
-#define TARGET_USE_XCHG_FOR_ATOMIC_STORE \
-	ix86_tune_features[X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE]
+#define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE]
 #define TARGET_EMIT_VZEROUPPER \
 	ix86_tune_features[X86_TUNE_EMIT_VZEROUPPER]
 #define TARGET_EXPAND_ABS \
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index e22109039c1..c6827037abf 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -89,7 +89,8 @@ 
 (define_insn "mfence_sse2"
   [(set (match_operand:BLK 0)
 	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]
-  "TARGET_64BIT || TARGET_SSE2"
+  "(TARGET_64BIT || TARGET_SSE2)
+   && !TARGET_AVOID_MFENCE"
   "mfence"
   [(set_attr "type" "sse")
    (set_attr "length_address" "0")
@@ -100,8 +101,14 @@ 
   [(set (match_operand:BLK 0)
 	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))
    (clobber (reg:CC FLAGS_REG))]
-  "!(TARGET_64BIT || TARGET_SSE2)"
-  "lock{%;} or{l}\t{$0, (%%esp)|DWORD PTR [esp], 0}"
+  "!(TARGET_64BIT || TARGET_SSE2)
+   || TARGET_AVOID_MFENCE"
+{
+  rtx mem = gen_rtx_MEM (word_mode, stack_pointer_rtx);
+
+  output_asm_insn ("lock{%;} or%z0\t{$0, %0|%0, 0}", &mem);
+  return "";
+}
   [(set_attr "memory" "unknown")])
 
 (define_expand "mem_thread_fence"
@@ -117,7 +124,8 @@ 
       rtx (*mfence_insn)(rtx);
       rtx mem;
 
-      if (TARGET_64BIT || TARGET_SSE2)
+      if ((TARGET_64BIT || TARGET_SSE2)
+	  && !TARGET_AVOID_MFENCE)
 	mfence_insn = gen_mfence_sse2;
       else
 	mfence_insn = gen_mfence_nosse;
@@ -306,11 +314,10 @@ 
     {
       operands[1] = force_reg (<MODE>mode, operands[1]);
 
-      /* For seq-cst stores, use XCHG when we lack MFENCE
-      	 or when target prefers XCHG.  */
+      /* For seq-cst stores, use XCHG when we lack MFENCE.  */
       if (is_mm_seq_cst (model)
 	  && (!(TARGET_64BIT || TARGET_SSE2)
-	      || TARGET_USE_XCHG_FOR_ATOMIC_STORE))
+	      || TARGET_AVOID_MFENCE))
 	{
 	  emit_insn (gen_atomic_exchange<mode> (gen_reg_rtx (<MODE>mode),
 						operands[0], operands[1],
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 1776aba2d17..6eff8256897 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -313,8 +313,8 @@  DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
 	  m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
 	  | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)
 
-/* X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE: Use xchg instead of mov+mfence.  */
-DEF_TUNE (X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE, "use_xchg_for_atomic_store",
+/* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
+DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
 	 m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC)
 
 /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
diff --git a/gcc/testsuite/gcc.target/i386/pr95750.c b/gcc/testsuite/gcc.target/i386/pr95750.c
new file mode 100644
index 00000000000..c47108fb796
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95750.c
@@ -0,0 +1,19 @@ 
+/* PR target/95750 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core2" } */
+
+void
+foo (void)
+{
+  __atomic_thread_fence (__ATOMIC_SEQ_CST);
+}
+
+int x;
+
+void
+bar (void)
+{
+  __atomic_store_n (&x, -1, __ATOMIC_SEQ_CST);
+}
+
+/* { dg-final { scan-assembler-not "mfence" } } */