x86: Add -mavoid-libcall for -mgeneral-regs-only

Message ID 20200514231256.564685-1-hjl.tools@gmail.com
State New
Headers show
Series
  • x86: Add -mavoid-libcall for -mgeneral-regs-only
Related show

Commit Message

H.J. Lu May 14, 2020, 11:12 p.m.
The -mgeneral-regs-only option generates code that uses only the
general-purpose registers.  It prevents the compiler from using vector
registers.  But GCC may still generate calls to memcpy, memmove, memset
and memcmp library functions.  In the GNU C library, these library
functions are implemented with vector registers, which makes the
-mgeneral-regs-only option less effective.  The new -mavoid-libcall
option expands memcpy, memmove and memset into REP MOVSB and REP STOSB
sequences.  This option can be further enhanced with a cmpmem pattern
to expand memcmp into a REP CMPSB sequence in the future.

Tested on Linux/x86 and Linux/x86-64.  OK for master?

Thanks.

H.J.
---
gcc/

	PR target/95134
	* config/i386/i386-expand.c (alg_usable_p): Return false for
	libcall with -mavoid-libcall.
	(decide_alg): Avoid libcall and use rep_prefix_1_byte instead
	with -mavoid-libcall.
	* config/i386/i386.opt: Add -mavoid-libcall.
	* doc/invoke.texi: Document -mavoid-libcall.

gcc/testsuite/

	PR target/95134
	* gcc.target/i386/pr95134-1.c: New test.
	* gcc.target/i386/pr95134-2.c: Likewise.
	* gcc.target/i386/pr95134-3.c: Likewise.
	* gcc.target/i386/pr95134-4.c: Likewise.
---
 gcc/config/i386/i386-expand.c             | 15 ++++++++++-----
 gcc/config/i386/i386.opt                  |  6 +++++-
 gcc/doc/invoke.texi                       | 10 +++++++++-
 gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++++++++++
 7 files changed, 89 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c

-- 
2.26.2

Comments

Uros Bizjak via Gcc-patches May 15, 2020, 6:11 a.m. | #1
On Fri, May 15, 2020 at 1:13 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> The -mgeneral-regs-only option generates code that uses only the

> general-purpose registers.  It prevents the compiler from using vector

> registers.  But GCC may still generate calls to memcpy, memmove, memset

> and memcmp library functions.  In the GNU C library, these library

> functions are implemented with vector registers, which makes the

> -mgeneral-regs-only option less effective.  The new -mavoid-libcall

> option expands memcpy, memmove and memset into REP MOVSB and REP STOSB

> sequence.  This option can be further enhanced with a cmpmem pattern

> to expand memcmp into REP CMPSB sequence in the future.

>

> Tested on Linux/x86 and Linux/x86-64.  OK for master?


No. Library should provide functions that are appropriate for your
target. There are probably other places in the library that use XMM
registers, so there is no point working around only some specific
functions.

Uros.

> Thanks.

>

> H.J.

> ---

> gcc/

>

>         PR target/95134

>         * config/i386/i386-expand.c (alg_usable_p): Return false for

>         libcall with -mavoid-libcall.

>         (decide_alg): Avoid libcall and rep_prefix_1_byte instead of

>         libcall with -mavoid-libcall.

>         * config/i386/i386.opt: Add -mavoid-libcall.

>         * doc/invoke.texi: Document -mavoid-libcall.

>

> gcc/testsuite/

>

>         PR target/95134

>         * gcc.target/i386/pr95134-1.c: New test.

>         * gcc.target/i386/pr95134-2.c: Likewise.

>         * gcc.target/i386/pr95134-3.c: Likewise.

>         * gcc.target/i386/pr95134-4.c: Likewise.

> ---

>  gcc/config/i386/i386-expand.c             | 15 ++++++++++-----

>  gcc/config/i386/i386.opt                  |  6 +++++-

>  gcc/doc/invoke.texi                       | 10 +++++++++-

>  gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++++++++++++++++++

>  gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++++++++++++++++++

>  gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++++++++++++++++++

>  gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++++++++++

>  7 files changed, 89 insertions(+), 7 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c

>

> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> index 26531585c5f..b38463bf88c 100644

> --- a/gcc/config/i386/i386-expand.c

> +++ b/gcc/config/i386/i386-expand.c

> @@ -6816,7 +6816,7 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)

>           || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))

>         return false;

>      }

> -  return true;

> +  return !flag_avoid_libcall || alg != libcall;

>  }

>

>  /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */

> @@ -6889,7 +6889,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

>       setup.  */

>    else if (expected_size != -1 && expected_size < 4)

>      return loop_1_byte;

> -  else if (expected_size != -1)

> +  else if (expected_size != -1 && !flag_avoid_libcall)

>      {

>        enum stringop_alg alg = libcall;

>        bool alg_noalign = false;

> @@ -6934,6 +6934,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

>             }

>         }

>      }

> +

> +  enum stringop_alg alg;

> +

>    /* When asked to inline the call anyway, try to pick meaningful choice.

>       We look for maximal size of block that is faster to copy by hand and

>       take blocks of at most of that size guessing that average size will

> @@ -6945,7 +6948,6 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

>        && (algs->unknown_size == libcall

>           || !alg_usable_p (algs->unknown_size, memset, have_as)))

>      {

> -      enum stringop_alg alg;

>        HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

>

>        /* If there aren't any usable algorithms or if recursing already,

> @@ -6967,8 +6969,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

>         gcc_assert (alg != libcall);

>        return alg;

>      }

> -  return (alg_usable_p (algs->unknown_size, memset, have_as)

> -         ? algs->unknown_size : libcall);

> +  alg = (alg_usable_p (algs->unknown_size, memset, have_as)

> +        ? algs->unknown_size : libcall);

> +  if (flag_avoid_libcall && alg == libcall)

> +    alg = rep_prefix_1_byte;

> +  return alg;

>  }

>

>  /* Decide on alignment.  We know that the operand is already aligned to ALIGN

> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt

> index c9f7195d423..23b401bd424 100644

> --- a/gcc/config/i386/i386.opt

> +++ b/gcc/config/i386/i386.opt

> @@ -1114,4 +1114,8 @@ Support SERIALIZE built-in functions and code generation.

>

>  mtsxldtrk

>  Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save

> -Support TSXLDTRK built-in functions and code generation.

> \ No newline at end of file

> +Support TSXLDTRK built-in functions and code generation.

> +

> +mavoid-libcall

> +Target Report Var(flag_avoid_libcall) Init(0)

> +Avoid generation of libcall.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

> index 850aeac033d..0d2d70419d5 100644

> --- a/gcc/doc/invoke.texi

> +++ b/gcc/doc/invoke.texi

> @@ -1364,7 +1364,7 @@ See RS/6000 and PowerPC Options.

>  -mstack-protector-guard-reg=@var{reg} @gol

>  -mstack-protector-guard-offset=@var{offset} @gol

>  -mstack-protector-guard-symbol=@var{symbol} @gol

> --mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol

> +-mgeneral-regs-only -mavoid-libcall -mcall-ms2sysv-xlogues @gol

>  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol

>  -mindirect-branch-register}

>

> @@ -30115,6 +30115,14 @@ Generate code that uses only the general-purpose registers.  This

>  prevents the compiler from using floating-point, vector, mask and bound

>  registers.

>

> +@item -mavoid-libcall

> +@opindex mavoid-libcall

> +Avoid generation of calls to @code{memcpy}, @code{memmove} and

> +@code{memset} library functions.  It can be used together with the

> +option @option{-mgeneral-regs-only} to avoid implicit vector register

> +usage in @code{memcpy}, @code{memmove} and @code{memset} library

> +functions.

> +

>  @item -mindirect-branch=@var{choice}

>  @opindex mindirect-branch

>  Convert indirect call and jump with @var{choice}.  The default is

> diff --git a/gcc/testsuite/gcc.target/i386/pr95134-1.c b/gcc/testsuite/gcc.target/i386/pr95134-1.c

> new file mode 100644

> index 00000000000..8ffa680559d

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95134-1.c

> @@ -0,0 +1,18 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=skylake" } */

> +

> +struct foo

> +{

> +  char array[513];

> +};

> +

> +extern struct foo x;

> +

> +int

> +func (void)

> +{

> +  __builtin_memset (&x, 0, sizeof (x));

> +  return 0;

> +}

> +

> +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memset" } } */

> diff --git a/gcc/testsuite/gcc.target/i386/pr95134-2.c b/gcc/testsuite/gcc.target/i386/pr95134-2.c

> new file mode 100644

> index 00000000000..7c6c42a736d

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95134-2.c

> @@ -0,0 +1,18 @@

> +/* { dg-do compile { target ia32 } } */

> +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } */

> +

> +struct foo

> +{

> +  char array[257];

> +};

> +

> +extern struct foo x;

> +

> +int

> +func (struct foo i)

> +{

> +  x = i;

> +  return 0;

> +}

> +

> +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */

> diff --git a/gcc/testsuite/gcc.target/i386/pr95134-3.c b/gcc/testsuite/gcc.target/i386/pr95134-3.c

> new file mode 100644

> index 00000000000..4e4428cd0ae

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95134-3.c

> @@ -0,0 +1,18 @@

> +/* { dg-do compile { target ia32 } } */

> +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } */

> +

> +struct foo

> +{

> +  char array[257];

> +};

> +

> +extern struct foo x;

> +

> +int

> +func (struct foo i)

> +{

> +  __builtin_memcpy (&x, &i, sizeof (x));

> +  return 0;

> +}

> +

> +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */

> diff --git a/gcc/testsuite/gcc.target/i386/pr95134-4.c b/gcc/testsuite/gcc.target/i386/pr95134-4.c

> new file mode 100644

> index 00000000000..d1bd8fbf4c1

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/pr95134-4.c

> @@ -0,0 +1,11 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall" } */

> +

> +int

> +func (void *d, void *s, unsigned int l)

> +{

> +  __builtin_memcpy (d, s, l);

> +  return 0;

> +}

> +

> +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */

> --

> 2.26.2

>
Jose E. Marchesi via Gcc-patches May 15, 2020, 7:17 a.m. | #2
On Fri, May 15, 2020 at 8:27 AM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>

> On Fri, May 15, 2020 at 1:13 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > The -mgeneral-regs-only option generates code that uses only the

> > general-purpose registers.  It prevents the compiler from using vector

> > registers.  But GCC may still generate calls to memcpy, memmove, memset

> > and memcmp library functions.  In the GNU C library, these library

> > functions are implemented with vector registers, which makes the

> > -mgeneral-regs-only option less effective.  The new -mavoid-libcall

> > option expands memcpy, memmove and memset into REP MOVSB and REP STOSB

> > sequence.  This option can be further enhanced with a cmpmem pattern

> > to expand memcmp into REP CMPSB sequence in the future.

> >

> > Tested on Linux/x86 and Linux/x86-64.  OK for master?

>

> No. Library should provide functions that are appropriate for your

> target. There are probably other places in the library that use XMM

> registers, so there is no point working around only some specific

> functions.


For those specific functions -minline-all-stringops should also work, no?

Richard.

> Uros.

>

> > Thanks.

> >

> > H.J.

> > ---

> > gcc/

> >

> >         PR target/95134

> >         * config/i386/i386-expand.c (alg_usable_p): Return false for

> >         libcall with -mavoid-libcall.

> >         (decide_alg): Avoid libcall and rep_prefix_1_byte instead of

> >         libcall with -mavoid-libcall.

> >         * config/i386/i386.opt: Add -mavoid-libcall.

> >         * doc/invoke.texi: Document -mavoid-libcall.

> >

> > gcc/testsuite/

> >

> >         PR target/95134

> >         * gcc.target/i386/pr95134-1.c: New test.

> >         * gcc.target/i386/pr95134-2.c: Likewise.

> >         * gcc.target/i386/pr95134-3.c: Likewise.

> >         * gcc.target/i386/pr95134-4.c: Likewise.

> > ---

> >  gcc/config/i386/i386-expand.c             | 15 ++++++++++-----

> >  gcc/config/i386/i386.opt                  |  6 +++++-

> >  gcc/doc/invoke.texi                       | 10 +++++++++-

> >  gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++++++++++++++++++

> >  gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++++++++++++++++++

> >  gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++++++++++++++++++

> >  gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++++++++++

> >  7 files changed, 89 insertions(+), 7 deletions(-)

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c

> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c

> >

> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c

> > index 26531585c5f..b38463bf88c 100644

> > --- a/gcc/config/i386/i386-expand.c

> > +++ b/gcc/config/i386/i386-expand.c

> > @@ -6816,7 +6816,7 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)

> >           || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))

> >         return false;

> >      }

> > -  return true;

> > +  return !flag_avoid_libcall || alg != libcall;

> >  }

> >

> >  /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */

> > @@ -6889,7 +6889,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

> >       setup.  */

> >    else if (expected_size != -1 && expected_size < 4)

> >      return loop_1_byte;

> > -  else if (expected_size != -1)

> > +  else if (expected_size != -1 && !flag_avoid_libcall)

> >      {

> >        enum stringop_alg alg = libcall;

> >        bool alg_noalign = false;

> > @@ -6934,6 +6934,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

> >             }

> >         }

> >      }

> > +

> > +  enum stringop_alg alg;

> > +

> >    /* When asked to inline the call anyway, try to pick meaningful choice.

> >       We look for maximal size of block that is faster to copy by hand and

> >       take blocks of at most of that size guessing that average size will

> > @@ -6945,7 +6948,6 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

> >        && (algs->unknown_size == libcall

> >           || !alg_usable_p (algs->unknown_size, memset, have_as)))

> >      {

> > -      enum stringop_alg alg;

> >        HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

> >

> >        /* If there aren't any usable algorithms or if recursing already,

> > @@ -6967,8 +6969,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,

> >         gcc_assert (alg != libcall);

> >        return alg;

> >      }

> > -  return (alg_usable_p (algs->unknown_size, memset, have_as)

> > -         ? algs->unknown_size : libcall);

> > +  alg = (alg_usable_p (algs->unknown_size, memset, have_as)

> > +        ? algs->unknown_size : libcall);

> > +  if (flag_avoid_libcall && alg == libcall)

> > +    alg = rep_prefix_1_byte;

> > +  return alg;

> >  }

> >

> >  /* Decide on alignment.  We know that the operand is already aligned to ALIGN

> > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt

> > index c9f7195d423..23b401bd424 100644

> > --- a/gcc/config/i386/i386.opt

> > +++ b/gcc/config/i386/i386.opt

> > @@ -1114,4 +1114,8 @@ Support SERIALIZE built-in functions and code generation.

> >

> >  mtsxldtrk

> >  Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save

> > -Support TSXLDTRK built-in functions and code generation.

> > \ No newline at end of file

> > +Support TSXLDTRK built-in functions and code generation.

> > +

> > +mavoid-libcall

> > +Target Report Var(flag_avoid_libcall) Init(0)

> > +Avoid generation of libcall.

> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

> > index 850aeac033d..0d2d70419d5 100644

> > --- a/gcc/doc/invoke.texi

> > +++ b/gcc/doc/invoke.texi

> > @@ -1364,7 +1364,7 @@ See RS/6000 and PowerPC Options.

> >  -mstack-protector-guard-reg=@var{reg} @gol

> >  -mstack-protector-guard-offset=@var{offset} @gol

> >  -mstack-protector-guard-symbol=@var{symbol} @gol

> > --mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol

> > +-mgeneral-regs-only -mavoid-libcall -mcall-ms2sysv-xlogues @gol

> >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol

> >  -mindirect-branch-register}

> >

> > @@ -30115,6 +30115,14 @@ Generate code that uses only the general-purpose registers.  This

> >  prevents the compiler from using floating-point, vector, mask and bound

> >  registers.

> >

> > +@item -mavoid-libcall

> > +@opindex mavoid-libcall

> > +Avoid generation of calls to @code{memcpy}, @code{memmove} and

> > +@code{memset} library functions.  It can be used together with the

> > +option @option{-mgeneral-regs-only} to avoid implicit vector register

> > +usage in @code{memcpy}, @code{memmove} and @code{memset} library

> > +functions.

> > +

> >  @item -mindirect-branch=@var{choice}

> >  @opindex mindirect-branch

> >  Convert indirect call and jump with @var{choice}.  The default is

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-1.c b/gcc/testsuite/gcc.target/i386/pr95134-1.c

> > new file mode 100644

> > index 00000000000..8ffa680559d

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95134-1.c

> > @@ -0,0 +1,18 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=skylake" } */

> > +

> > +struct foo

> > +{

> > +  char array[513];

> > +};

> > +

> > +extern struct foo x;

> > +

> > +int

> > +func (void)

> > +{

> > +  __builtin_memset (&x, 0, sizeof (x));

> > +  return 0;

> > +}

> > +

> > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memset" } } */

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-2.c b/gcc/testsuite/gcc.target/i386/pr95134-2.c

> > new file mode 100644

> > index 00000000000..7c6c42a736d

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95134-2.c

> > @@ -0,0 +1,18 @@

> > +/* { dg-do compile { target ia32 } } */

> > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } */

> > +

> > +struct foo

> > +{

> > +  char array[257];

> > +};

> > +

> > +extern struct foo x;

> > +

> > +int

> > +func (struct foo i)

> > +{

> > +  x = i;

> > +  return 0;

> > +}

> > +

> > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-3.c b/gcc/testsuite/gcc.target/i386/pr95134-3.c

> > new file mode 100644

> > index 00000000000..4e4428cd0ae

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95134-3.c

> > @@ -0,0 +1,18 @@

> > +/* { dg-do compile { target ia32 } } */

> > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } */

> > +

> > +struct foo

> > +{

> > +  char array[257];

> > +};

> > +

> > +extern struct foo x;

> > +

> > +int

> > +func (struct foo i)

> > +{

> > +  __builtin_memcpy (&x, &i, sizeof (x));

> > +  return 0;

> > +}

> > +

> > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */

> > diff --git a/gcc/testsuite/gcc.target/i386/pr95134-4.c b/gcc/testsuite/gcc.target/i386/pr95134-4.c

> > new file mode 100644

> > index 00000000000..d1bd8fbf4c1

> > --- /dev/null

> > +++ b/gcc/testsuite/gcc.target/i386/pr95134-4.c

> > @@ -0,0 +1,11 @@

> > +/* { dg-do compile } */

> > +/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall" } */

> > +

> > +int

> > +func (void *d, void *s, unsigned int l)

> > +{

> > +  __builtin_memcpy (d, s, l);

> > +  return 0;

> > +}

> > +

> > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */

> > --

> > 2.26.2

> >
Jose E. Marchesi via Gcc-patches May 20, 2020, 9:53 p.m. | #3
On Fri, 2020-05-15 at 08:11 +0200, Uros Bizjak via Gcc-patches wrote:
> On Fri, May 15, 2020 at 1:13 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> > The -mgeneral-regs-only option generates code that uses only the

> > general-purpose registers.  It prevents the compiler from using vector

> > registers.  But GCC may still generate calls to memcpy, memmove, memset

> > and memcmp library functions.  In the GNU C library, these library

> > functions are implemented with vector registers, which makes the

> > -mgeneral-regs-only option less effective.  The new -mavoid-libcall

> > option expands memcpy, memmove and memset into REP MOVSB and REP STOSB

> > sequence.  This option can be further enhanced with a cmpmem pattern

> > to expand memcmp into REP CMPSB sequence in the future.

> > 

> > Tested on Linux/x86 and Linux/x86-64.  OK for master?

> 

> No. Library should provide functions that are appropriate for your

> target. There are probably other places in the library that use XMM

> registers, so there is no point working around only some specific

> functions.

Couldn't one make the argument that we should be using -ffreestanding and that
-ffreestanding should be emitting the necessary inline code rather than calling
out to memcpy or whatever it's doing?

jeff
>

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 26531585c5f..b38463bf88c 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -6816,7 +6816,7 @@  alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
 	return false;
     }
-  return true;
+  return !flag_avoid_libcall || alg != libcall;
 }
 
 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
@@ -6889,7 +6889,7 @@  decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
      setup.  */
   else if (expected_size != -1 && expected_size < 4)
     return loop_1_byte;
-  else if (expected_size != -1)
+  else if (expected_size != -1 && !flag_avoid_libcall)
     {
       enum stringop_alg alg = libcall;
       bool alg_noalign = false;
@@ -6934,6 +6934,9 @@  decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
 	    }
 	}
     }
+
+  enum stringop_alg alg;
+
   /* When asked to inline the call anyway, try to pick meaningful choice.
      We look for maximal size of block that is faster to copy by hand and
      take blocks of at most of that size guessing that average size will
@@ -6945,7 +6948,6 @@  decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
       && (algs->unknown_size == libcall
 	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
     {
-      enum stringop_alg alg;
       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
 
       /* If there aren't any usable algorithms or if recursing already,
@@ -6967,8 +6969,11 @@  decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
 	gcc_assert (alg != libcall);
       return alg;
     }
-  return (alg_usable_p (algs->unknown_size, memset, have_as)
-	  ? algs->unknown_size : libcall);
+  alg = (alg_usable_p (algs->unknown_size, memset, have_as)
+	 ? algs->unknown_size : libcall);
+  if (flag_avoid_libcall && alg == libcall)
+    alg = rep_prefix_1_byte;
+  return alg;
 }
 
 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c9f7195d423..23b401bd424 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1114,4 +1114,8 @@  Support SERIALIZE built-in functions and code generation.
 
 mtsxldtrk
 Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save
-Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file
+Support TSXLDTRK built-in functions and code generation.
+
+mavoid-libcall
+Target Report Var(flag_avoid_libcall) Init(0)
+Avoid generation of libcall.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 850aeac033d..0d2d70419d5 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1364,7 +1364,7 @@  See RS/6000 and PowerPC Options.
 -mstack-protector-guard-reg=@var{reg} @gol
 -mstack-protector-guard-offset=@var{offset} @gol
 -mstack-protector-guard-symbol=@var{symbol} @gol
--mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
+-mgeneral-regs-only -mavoid-libcall -mcall-ms2sysv-xlogues @gol
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
 -mindirect-branch-register}
 
@@ -30115,6 +30115,14 @@  Generate code that uses only the general-purpose registers.  This
 prevents the compiler from using floating-point, vector, mask and bound
 registers.
 
+@item -mavoid-libcall
+@opindex mavoid-libcall
+Avoid generation of calls to @code{memcpy}, @code{memmove} and
+@code{memset} library functions.  It can be used together with the
+option @option{-mgeneral-regs-only} to avoid implicit vector register
+usage in @code{memcpy}, @code{memmove} and @code{memset} library
+functions.
+
 @item -mindirect-branch=@var{choice}
 @opindex mindirect-branch
 Convert indirect call and jump with @var{choice}.  The default is
diff --git a/gcc/testsuite/gcc.target/i386/pr95134-1.c b/gcc/testsuite/gcc.target/i386/pr95134-1.c
new file mode 100644
index 00000000000..8ffa680559d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95134-1.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=skylake" } */
+
+struct foo
+{
+  char array[513];
+};
+
+extern struct foo x;
+
+int
+func (void)
+{
+  __builtin_memset (&x, 0, sizeof (x));
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "call\[\\t \]*_?memset" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95134-2.c b/gcc/testsuite/gcc.target/i386/pr95134-2.c
new file mode 100644
index 00000000000..7c6c42a736d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95134-2.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } */
+
+struct foo
+{
+  char array[257];
+};
+
+extern struct foo x;
+
+int
+func (struct foo i)
+{
+  x = i;
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95134-3.c b/gcc/testsuite/gcc.target/i386/pr95134-3.c
new file mode 100644
index 00000000000..4e4428cd0ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95134-3.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall -mtune=pentium" } */
+
+struct foo
+{
+  char array[257];
+};
+
+extern struct foo x;
+
+int
+func (struct foo i)
+{
+  __builtin_memcpy (&x, &i, sizeof (x));
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95134-4.c b/gcc/testsuite/gcc.target/i386/pr95134-4.c
new file mode 100644
index 00000000000..d1bd8fbf4c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95134-4.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only -mavoid-libcall" } */
+
+int
+func (void *d, void *s, unsigned int l)
+{
+  __builtin_memcpy (d, s, l);
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcpy" } } */