x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY

Message ID 20190107174014.GA17007@intel.com
State New
Headers show
Series
  • x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY
Related show

Commit Message

H.J. Lu Jan. 7, 2019, 5:40 p.m.
There is no need to generate vzeroupper if caller uses upper bits of
AVX/AVX512 registers,  We track caller's avx_u128_state and avoid
vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.

Tested on i686 and x86-64 with and without --with-arch=native.

OK for trunk?

Thanks.

H.J.
---
gcc/

	PR target/88717
	* config/i386/i386.c (ix86_avx_u128_mode_entry): Set
	caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY.
	(ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if
	caller is AVX_U128_DIRTY.
	* config/i386/i386.h (machine_function): Add
	caller_avx_u128_dirty.

gcc/testsuite/

	PR target/88717
	* gcc.target/i386/pr88717.c: New test.
---
 gcc/config/i386/i386.c                  | 10 +++++++++-
 gcc/config/i386/i386.h                  |  3 +++
 gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c

-- 
2.20.1

Comments

Uros Bizjak Jan. 8, 2019, 7:12 a.m. | #1
On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:
>

> There is no need to generate vzeroupper if caller uses upper bits of

> AVX/AVX512 registers,  We track caller's avx_u128_state and avoid

> vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.

>

> Tested on i686 and x86-64 with and without --with-arch=native.

>

> OK for trunk?


In principle OK, but I think we don't have to cache the result of
ix86_avx_u128_mode_entry. Simply call the function from
ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
afford to re-call it one more time per function.

Uros.

> Thanks.

>

> H.J.

> ---

> gcc/

>

>         PR target/88717

>         * config/i386/i386.c (ix86_avx_u128_mode_entry): Set

>         caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY.

>         (ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if

>         caller is AVX_U128_DIRTY.

>         * config/i386/i386.h (machine_function): Add

>         caller_avx_u128_dirty.

>

> gcc/testsuite/

>

>         PR target/88717

>         * gcc.target/i386/pr88717.c: New test.

> ---

>  gcc/config/i386/i386.c                  | 10 +++++++++-

>  gcc/config/i386/i386.h                  |  3 +++

>  gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++

>  3 files changed, 36 insertions(+), 1 deletion(-)

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c

>

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index d01278d866f..9b49a2c1d9c 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void)

>        rtx incoming = DECL_INCOMING_RTL (arg);

>

>        if (incoming && ix86_check_avx_upper_register (incoming))

> -       return AVX_U128_DIRTY;

> +       {

> +         /* Caller is AVX_U128_DIRTY.  */

> +         cfun->machine->caller_avx_u128_dirty = true;

> +         return AVX_U128_DIRTY;

> +       }

>      }

>

>    return AVX_U128_CLEAN;

> @@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity)

>  static int

>  ix86_avx_u128_mode_exit (void)

>  {

> +  /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY.  */

> +  if (cfun->machine->caller_avx_u128_dirty)

> +    return AVX_U128_DIRTY;

> +

>    rtx reg = crtl->return_rtx;

>

>    /* Exit mode is set to AVX_U128_DIRTY if there are 256bit

> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

> index 83b025e0cf5..c053b657a55 100644

> --- a/gcc/config/i386/i386.h

> +++ b/gcc/config/i386/i386.h

> @@ -2747,6 +2747,9 @@ struct GTY(()) machine_function {

>    /* If true, ENDBR is queued at function entrance.  */

>    BOOL_BITFIELD endbr_queued_at_entrance : 1;

>

> +  /* If true, caller is AVX_U128_DIRTY.  */

> +  BOOL_BITFIELD caller_avx_u128_dirty : 1;

> +

>    /* The largest alignment, in bytes, of stack slot actually used.  */

>    unsigned int max_used_stack_alignment;

>

> diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c

> new file mode 100644

> index 00000000000..01680998f1b

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr88717.c

> @@ -0,0 +1,24 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -mavx512f -mvzeroupper" } */

> +

> +#include <immintrin.h>

> +

> +__m128

> +foo1 (__m256 x)

> +{

> +  return _mm256_castps256_ps128 (x);

> +}

> +

> +void

> +foo2 (float *p, __m256 x)

> +{

> +  *p = ((__v8sf)x)[0];

> +}

> +

> +void

> +foo3 (float *p, __m512 x)

> +{

> +  *p = ((__v16sf)x)[0];

> +}

> +

> +/* { dg-final { scan-assembler-not "vzeroupper" } } */

> --

> 2.20.1

>
H.J. Lu Jan. 8, 2019, 2:39 p.m. | #2
On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>

> On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:

> >

> > There is no need to generate vzeroupper if caller uses upper bits of

> > AVX/AVX512 registers,  We track caller's avx_u128_state and avoid

> > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.

> >

> > Tested on i686 and x86-64 with and without --with-arch=native.

> >

> > OK for trunk?

>

> In principle OK, but I think we don't have to cache the result of

> ix86_avx_u128_mode_entry. Simply call the function from

> ix86_avx_u128_mode_exit; it is a simple function, so I guess we can

> afford to re-call it one more time per function.


Do we really need ix86_avx_u128_mode_entry?  We can just
set entry state to AVX_U128_CLEAN and set exit state to
AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes
AVX/AVX512 registers to callee.

Does this patch look OK?

Thanks.

H.J.
--
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..1ac89fd2eb5 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19087,25 +19087,6 @@ ix86_dirflag_mode_entry (void)
   return X86_DIRFLAG_RESET;
 }

-static int
-ix86_avx_u128_mode_entry (void)
-{
-  tree arg;
-
-  /* Entry mode is set to AVX_U128_DIRTY if there are
-     256bit or 512bit modes used in function arguments.  */
-  for (arg = DECL_ARGUMENTS (current_function_decl); arg;
-       arg = TREE_CHAIN (arg))
-    {
-      rtx incoming = DECL_INCOMING_RTL (arg);
-
-      if (incoming && ix86_check_avx_upper_register (incoming))
- return AVX_U128_DIRTY;
-    }
-
-  return AVX_U128_CLEAN;
-}
-
 /* Return a mode that ENTITY is assumed to be
    switched to at function entry.  */

@@ -19117,7 +19098,7 @@ ix86_mode_entry (int entity)
     case X86_DIRFLAG:
       return ix86_dirflag_mode_entry ();
     case AVX_U128:
-      return ix86_avx_u128_mode_entry ();
+      return AVX_U128_CLEAN;
     case I387_TRUNC:
     case I387_FLOOR:
     case I387_CEIL:
@@ -19130,13 +19111,24 @@ ix86_mode_entry (int entity)
 static int
 ix86_avx_u128_mode_exit (void)
 {
+  /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
+     modes used in function arguments or function return..  */
   rtx reg = crtl->return_rtx;

-  /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
-     or 512 bit modes used in the function return register. */
   if (reg && ix86_check_avx_upper_register (reg))
     return AVX_U128_DIRTY;

+  tree arg;
+
+  for (arg = DECL_ARGUMENTS (current_function_decl); arg;
+       arg = TREE_CHAIN (arg))
+    {
+      rtx incoming = DECL_INCOMING_RTL (arg);
+
+      if (incoming && ix86_check_avx_upper_register (incoming))
+ return AVX_U128_DIRTY;
+    }
+
   return AVX_U128_CLEAN;
 }
Uros Bizjak Jan. 8, 2019, 2:53 p.m. | #3
On Tue, Jan 8, 2019 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote:

> >

> > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote:

> > >

> > > There is no need to generate vzeroupper if caller uses upper bits of

> > > AVX/AVX512 registers,  We track caller's avx_u128_state and avoid

> > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.

> > >

> > > Tested on i686 and x86-64 with and without --with-arch=native.

> > >

> > > OK for trunk?

> >

> > In principle OK, but I think we don't have to cache the result of

> > ix86_avx_u128_mode_entry. Simply call the function from

> > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can

> > afford to re-call it one more time per function.

>

> Do we really need ix86_avx_u128_mode_entry?  We can just

> set entry state to AVX_U128_CLEAN and set exit state to

> AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes

> AVX/AVX512 registers to callee.

>

> Does this patch look OK?


No, the compiler is then free to move optimal insertion point at the
beginning of the function.

Uros.

> Thanks.

>

> H.J.

> --

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index d01278d866f..1ac89fd2eb5 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -19087,25 +19087,6 @@ ix86_dirflag_mode_entry (void)

>    return X86_DIRFLAG_RESET;

>  }

>

> -static int

> -ix86_avx_u128_mode_entry (void)

> -{

> -  tree arg;

> -

> -  /* Entry mode is set to AVX_U128_DIRTY if there are

> -     256bit or 512bit modes used in function arguments.  */

> -  for (arg = DECL_ARGUMENTS (current_function_decl); arg;

> -       arg = TREE_CHAIN (arg))

> -    {

> -      rtx incoming = DECL_INCOMING_RTL (arg);

> -

> -      if (incoming && ix86_check_avx_upper_register (incoming))

> - return AVX_U128_DIRTY;

> -    }

> -

> -  return AVX_U128_CLEAN;

> -}

> -

>  /* Return a mode that ENTITY is assumed to be

>     switched to at function entry.  */

>

> @@ -19117,7 +19098,7 @@ ix86_mode_entry (int entity)

>      case X86_DIRFLAG:

>        return ix86_dirflag_mode_entry ();

>      case AVX_U128:

> -      return ix86_avx_u128_mode_entry ();

> +      return AVX_U128_CLEAN;

>      case I387_TRUNC:

>      case I387_FLOOR:

>      case I387_CEIL:

> @@ -19130,13 +19111,24 @@ ix86_mode_entry (int entity)

>  static int

>  ix86_avx_u128_mode_exit (void)

>  {

> +  /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit

> +     modes used in function arguments or function return..  */

>    rtx reg = crtl->return_rtx;

>

> -  /* Exit mode is set to AVX_U128_DIRTY if there are 256bit

> -     or 512 bit modes used in the function return register. */

>    if (reg && ix86_check_avx_upper_register (reg))

>      return AVX_U128_DIRTY;

>

> +  tree arg;

> +

> +  for (arg = DECL_ARGUMENTS (current_function_decl); arg;

> +       arg = TREE_CHAIN (arg))

> +    {

> +      rtx incoming = DECL_INCOMING_RTL (arg);

> +

> +      if (incoming && ix86_check_avx_upper_register (incoming))

> + return AVX_U128_DIRTY;

> +    }

> +

>    return AVX_U128_CLEAN;

>  }

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..9b49a2c1d9c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19100,7 +19100,11 @@  ix86_avx_u128_mode_entry (void)
       rtx incoming = DECL_INCOMING_RTL (arg);
 
       if (incoming && ix86_check_avx_upper_register (incoming))
-	return AVX_U128_DIRTY;
+	{
+	  /* Caller is AVX_U128_DIRTY.  */
+	  cfun->machine->caller_avx_u128_dirty = true;
+	  return AVX_U128_DIRTY;
+	}
     }
 
   return AVX_U128_CLEAN;
@@ -19130,6 +19134,10 @@  ix86_mode_entry (int entity)
 static int
 ix86_avx_u128_mode_exit (void)
 {
+  /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY.  */
+  if (cfun->machine->caller_avx_u128_dirty)
+    return AVX_U128_DIRTY;
+
   rtx reg = crtl->return_rtx;
 
   /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 83b025e0cf5..c053b657a55 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2747,6 +2747,9 @@  struct GTY(()) machine_function {
   /* If true, ENDBR is queued at function entrance.  */
   BOOL_BITFIELD endbr_queued_at_entrance : 1;
 
+  /* If true, caller is AVX_U128_DIRTY.  */
+  BOOL_BITFIELD caller_avx_u128_dirty : 1;
+
   /* The largest alignment, in bytes, of stack slot actually used.  */
   unsigned int max_used_stack_alignment;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c
new file mode 100644
index 00000000000..01680998f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88717.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
+
+#include <immintrin.h>
+
+__m128
+foo1 (__m256 x)
+{
+  return _mm256_castps256_ps128 (x);
+}
+
+void
+foo2 (float *p, __m256 x)
+{
+  *p = ((__v8sf)x)[0];
+}
+
+void
+foo3 (float *p, __m512 x)
+{
+  *p = ((__v16sf)x)[0];
+}
+
+/* { dg-final { scan-assembler-not "vzeroupper" } } */