[1/3] Support instrumenting returns of instrumented functions

Message ID 20181105000221.32180-1-andi@firstfloor.org
State New
Headers show
Series
  • [1/3] Support instrumenting returns of instrumented functions
Related show

Commit Message

Andi Kleen Nov. 5, 2018, 12:02 a.m.
From: Andi Kleen <ak@linux.intel.com>


When instrumenting programs using __fentry__ it is often useful
to instrument the function return too. Traditionally this
has been done by patching the return address on the stack
frame on entry. However this is fairly complicated (trace
function has to emulate a stack) and also slow because
it causes a branch misprediction on every return.

Add an option to generate call or nop instrumentation for
every return instead, including patch sections.

This will increase the program size slightly, but can be a
lot faster and simpler.

This version only instruments true returns, not sibling
calls or tail recursion. This matches the semantics of the
original stack.

gcc/:

2018-11-04  Andi Kleen  <ak@linux.intel.com>

	* config/i386/i386-opts.h (enum instrument_return): Add.
	* config/i386/i386.c (output_return_instrumentation): Add.
	(ix86_output_function_return): Call output_return_instrumentation.
	(ix86_output_call_insn): Call output_return_instrumentation.
	* config/i386/i386.opt: Add -minstrument-return=.
	* doc/invoke.texi (-minstrument-return): Document.

gcc/testsuite/:

2018-11-04  Andi Kleen  <ak@linux.intel.com>

	* gcc.target/i386/returninst1.c: New test.
	* gcc.target/i386/returninst2.c: New test.
	* gcc.target/i386/returninst3.c: New test.
---
 gcc/config/i386/i386-opts.h                 |  6 ++++
 gcc/config/i386/i386.c                      | 36 +++++++++++++++++++++
 gcc/config/i386/i386.opt                    | 21 ++++++++++++
 gcc/doc/invoke.texi                         | 14 ++++++++
 gcc/testsuite/gcc.target/i386/returninst1.c | 14 ++++++++
 gcc/testsuite/gcc.target/i386/returninst2.c | 21 ++++++++++++
 gcc/testsuite/gcc.target/i386/returninst3.c |  9 ++++++
 7 files changed, 121 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/returninst1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/returninst2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/returninst3.c

-- 
2.19.1

Comments

Andi Kleen Nov. 8, 2018, 5:11 p.m. | #1
Andi Kleen <andi@firstfloor.org> writes:

Ping!

> From: Andi Kleen <ak@linux.intel.com>

>

> When instrumenting programs using __fentry__ it is often useful

> to instrument the function return too. Traditionally this

> has been done by patching the return address on the stack

> frame on entry. However this is fairly complicated (trace

> function has to emulate a stack) and also slow because

> it causes a branch misprediction on every return.

>

> Add an option to generate call or nop instrumentation for

> every return instead, including patch sections.

>

> This will increase the program size slightly, but can be a

> lot faster and simpler.

>

> This version only instruments true returns, not sibling

> calls or tail recursion. This matches the semantics of the

> original stack.

>

> gcc/:

>

> 2018-11-04  Andi Kleen  <ak@linux.intel.com>

>

> 	* config/i386/i386-opts.h (enum instrument_return): Add.

> 	* config/i386/i386.c (output_return_instrumentation): Add.

> 	(ix86_output_function_return): Call output_return_instrumentation.

> 	(ix86_output_call_insn): Call output_return_instrumentation.

> 	* config/i386/i386.opt: Add -minstrument-return=.

> 	* doc/invoke.texi (-minstrument-return): Document.

>

> gcc/testsuite/:

>

> 2018-11-04  Andi Kleen  <ak@linux.intel.com>

>

> 	* gcc.target/i386/returninst1.c: New test.

> 	* gcc.target/i386/returninst2.c: New test.

> 	* gcc.target/i386/returninst3.c: New test.

> ---

>  gcc/config/i386/i386-opts.h                 |  6 ++++

>  gcc/config/i386/i386.c                      | 36 +++++++++++++++++++++

>  gcc/config/i386/i386.opt                    | 21 ++++++++++++

>  gcc/doc/invoke.texi                         | 14 ++++++++

>  gcc/testsuite/gcc.target/i386/returninst1.c | 14 ++++++++

>  gcc/testsuite/gcc.target/i386/returninst2.c | 21 ++++++++++++

>  gcc/testsuite/gcc.target/i386/returninst3.c |  9 ++++++

>  7 files changed, 121 insertions(+)

>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst1.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst2.c

>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst3.c

>

> diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h

> index 46366cbfa72..35e9413100e 100644

> --- a/gcc/config/i386/i386-opts.h

> +++ b/gcc/config/i386/i386-opts.h

> @@ -119,4 +119,10 @@ enum indirect_branch {

>    indirect_branch_thunk_extern

>  };

>  

> +enum instrument_return {

> +  instrument_return_none = 0,

> +  instrument_return_call,

> +  instrument_return_nop5

> +};

> +

>  #endif

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

> index f9ef0b4445b..f7cd94a8139 100644

> --- a/gcc/config/i386/i386.c

> +++ b/gcc/config/i386/i386.c

> @@ -28336,12 +28336,47 @@ ix86_output_indirect_jmp (rtx call_op)

>      return "%!jmp\t%A0";

>  }

>  

> +/* Output return instrumentation for current function if needed.  */

> +

> +static void

> +output_return_instrumentation (void)

> +{

> +  if (ix86_instrument_return != instrument_return_none

> +      && flag_fentry

> +      && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl))

> +    {

> +      if (ix86_flag_record_return)

> +	fprintf (asm_out_file, "1:\n");

> +      switch (ix86_instrument_return)

> +	{

> +	case instrument_return_call:

> +	  fprintf (asm_out_file, "\tcall\t__return__\n");

> +	  break;

> +	case instrument_return_nop5:

> +	  /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1)  */

> +	  fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");

> +	  break;

> +	case instrument_return_none:

> +	  break;

> +	}

> +

> +      if (ix86_flag_record_return)

> +	{

> +	  fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n");

> +	  fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");

> +	  fprintf (asm_out_file, "\t.previous\n");

> +	}

> +    }

> +}

> +

>  /* Output function return.  CALL_OP is the jump target.  Add a REP

>     prefix to RET if LONG_P is true and function return is kept.  */

>  

>  const char *

>  ix86_output_function_return (bool long_p)

>  {

> +  output_return_instrumentation ();

> +

>    if (cfun->machine->function_return_type != indirect_branch_keep)

>      {

>        char thunk_name[32];

> @@ -28454,6 +28489,7 @@ ix86_output_call_insn (rtx_insn *insn, rtx call_op)

>  

>    if (SIBLING_CALL_P (insn))

>      {

> +      output_return_instrumentation ();

>        if (direct_p)

>  	{

>  	  if (ix86_nopic_noplt_attribute_p (call_op))

> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt

> index e7fbf9b6f99..5925b75244f 100644

> --- a/gcc/config/i386/i386.opt

> +++ b/gcc/config/i386/i386.opt

> @@ -1063,3 +1063,24 @@ Support WAITPKG built-in functions and code generation.

>  mcldemote

>  Target Report Mask(ISA_CLDEMOTE) Var(ix86_isa_flags2) Save

>  Support CLDEMOTE built-in functions and code generation.

> +

> +minstrument-return=

> +Target Report RejectNegative Joined Enum(instrument_return) Var(ix86_instrument_return) Init(instrument_return_none)

> +Instrument function exit in instrumented functions with __fentry__.

> +

> +Enum

> +Name(instrument_return) Type(enum instrument_return)

> +Known choices for return instrumentation with -minstrument-return=

> +

> +EnumValue

> +Enum(instrument_return) String(none) Value(instrument_return_none)

> +

> +EnumValue

> +Enum(instrument_return) String(call) Value(instrument_return_call)

> +

> +EnumValue

> +Enum(instrument_return) String(nop5) Value(instrument_return_nop5)

> +

> +mrecord-return

> +Target Report Var(ix86_flag_record_return) Init(0)

> +Generate a __return_loc section pointing to all return instrumentation code.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

> index 1743c64582e..939be3e251b 100644

> --- a/gcc/doc/invoke.texi

> +++ b/gcc/doc/invoke.texi

> @@ -1301,6 +1301,7 @@ See RS/6000 and PowerPC Options.

>  -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol

>  -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol

>  -msse2avx  -mfentry  -mrecord-mcount  -mnop-mcount  -m8bit-idiv @gol

> +-minstrument-return=@var{type} @gol

>  -mavx256-split-unaligned-load  -mavx256-split-unaligned-store @gol

>  -malign-data=@var{type}  -mstack-protector-guard=@var{guard} @gol

>  -mstack-protector-guard-reg=@var{reg} @gol

> @@ -28442,6 +28443,19 @@ the profiling functions as NOPs. This is useful when they

>  should be patched in later dynamically. This is likely only

>  useful together with @option{-mrecord-mcount}.

>  

> +@item -minstrument-return=@var{type}

> +@opindex minstrument-return

> +Instrument function exit in -pg -mfentry instrumented functions with

> +call to specified function. This only instruments true returns ending

> +with ret, but not sibling calls ending with jump. Valid types

> +are @var{none} to not instrument, @var{call} to generate a call to __return__,

> +or @var{nop5} to generate a 5 byte nop.

> +

> +@item -mrecord-return

> +@itemx -mno-record-return

> +@opindex mrecord-return

> +Generate a __return_loc section pointing to all return instrumentation code.

> +

>  @item -mskip-rax-setup

>  @itemx -mno-skip-rax-setup

>  @opindex mskip-rax-setup

> diff --git a/gcc/testsuite/gcc.target/i386/returninst1.c b/gcc/testsuite/gcc.target/i386/returninst1.c

> new file mode 100644

> index 00000000000..f970e75a774

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/returninst1.c

> @@ -0,0 +1,14 @@

> +/* { dg-do compile } */

> +/* { dg-options "-pg -mfentry -minstrument-return=call -mrecord-return" } */

> +/* { dg-final { scan-assembler "call.*__return__" } } */

> +/* { dg-final { scan-assembler "section.*return_loc" } } */

> +

> +int func(int a)

> +{

> +  return a+1;

> +}

> +

> +int func2(int a)

> +{

> +  return a+1;

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/returninst2.c b/gcc/testsuite/gcc.target/i386/returninst2.c

> new file mode 100644

> index 00000000000..716b38556dd

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/returninst2.c

> @@ -0,0 +1,21 @@

> +/* { dg-do compile } */

> +/* { dg-options "-pg -mfentry -minstrument-return=nop5 -mrecord-return" } */

> +/* { dg-final { scan-assembler-times "0x0f, 0x1f, 0x44, 0x00, 0x00" 3 } } */

> +/* { dg-final { scan-assembler "section.*return_loc" } } */

> +

> +int func(int a)

> +{

> +  return a+1;

> +}

> +

> +int func2(int a)

> +{

> +  return a+1;

> +}

> +

> +extern void func4(int);

> +

> +int func3(int a)

> +{

> +  func4(a + 1);

> +}

> diff --git a/gcc/testsuite/gcc.target/i386/returninst3.c b/gcc/testsuite/gcc.target/i386/returninst3.c

> new file mode 100644

> index 00000000000..5bbc60e8bd4

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/i386/returninst3.c

> @@ -0,0 +1,9 @@

> +/* { dg-do compile } */

> +/* { dg-options "-pg -mfentry -minstrument-return=call" } */

> +/* { dg-final { scan-assembler-not "call.*__return__" } } */

> +

> +__attribute__((no_instrument_function))

> +int func(int a)

> +{

> +  return a+1;

> +}
Andi Kleen Nov. 15, 2018, 7:03 p.m. | #2
Andi Kleen <ak@linux.intel.com> writes:

Ping!^2

> Andi Kleen <andi@firstfloor.org> writes:

>

> Ping!

>

>> From: Andi Kleen <ak@linux.intel.com>

>>

>> When instrumenting programs using __fentry__ it is often useful

>> to instrument the function return too. Traditionally this

>> has been done by patching the return address on the stack

>> frame on entry. However this is fairly complicated (trace

>> function has to emulate a stack) and also slow because

>> it causes a branch misprediction on every return.

>>

>> Add an option to generate call or nop instrumentation for

>> every return instead, including patch sections.

>>

>> This will increase the program size slightly, but can be a

>> lot faster and simpler.

>>

>> This version only instruments true returns, not sibling

>> calls or tail recursion. This matches the semantics of the

>> original stack.

>>

>> gcc/:

>>

>> 2018-11-04  Andi Kleen  <ak@linux.intel.com>

>>

>> 	* config/i386/i386-opts.h (enum instrument_return): Add.

>> 	* config/i386/i386.c (output_return_instrumentation): Add.

>> 	(ix86_output_function_return): Call output_return_instrumentation.

>> 	(ix86_output_call_insn): Call output_return_instrumentation.

>> 	* config/i386/i386.opt: Add -minstrument-return=.

>> 	* doc/invoke.texi (-minstrument-return): Document.

>>

>> gcc/testsuite/:

>>

>> 2018-11-04  Andi Kleen  <ak@linux.intel.com>

>>

>> 	* gcc.target/i386/returninst1.c: New test.

>> 	* gcc.target/i386/returninst2.c: New test.

>> 	* gcc.target/i386/returninst3.c: New test.

>> ---

>>  gcc/config/i386/i386-opts.h                 |  6 ++++

>>  gcc/config/i386/i386.c                      | 36 +++++++++++++++++++++

>>  gcc/config/i386/i386.opt                    | 21 ++++++++++++

>>  gcc/doc/invoke.texi                         | 14 ++++++++

>>  gcc/testsuite/gcc.target/i386/returninst1.c | 14 ++++++++

>>  gcc/testsuite/gcc.target/i386/returninst2.c | 21 ++++++++++++

>>  gcc/testsuite/gcc.target/i386/returninst3.c |  9 ++++++

>>  7 files changed, 121 insertions(+)

>>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst1.c

>>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst2.c

>>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst3.c

>>

>> diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h

>> index 46366cbfa72..35e9413100e 100644

>> --- a/gcc/config/i386/i386-opts.h

>> +++ b/gcc/config/i386/i386-opts.h

>> @@ -119,4 +119,10 @@ enum indirect_branch {

>>    indirect_branch_thunk_extern

>>  };

>>  

>> +enum instrument_return {

>> +  instrument_return_none = 0,

>> +  instrument_return_call,

>> +  instrument_return_nop5

>> +};

>> +

>>  #endif

>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

>> index f9ef0b4445b..f7cd94a8139 100644

>> --- a/gcc/config/i386/i386.c

>> +++ b/gcc/config/i386/i386.c

>> @@ -28336,12 +28336,47 @@ ix86_output_indirect_jmp (rtx call_op)

>>      return "%!jmp\t%A0";

>>  }

>>  

>> +/* Output return instrumentation for current function if needed.  */

>> +

>> +static void

>> +output_return_instrumentation (void)

>> +{

>> +  if (ix86_instrument_return != instrument_return_none

>> +      && flag_fentry

>> +      && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl))

>> +    {

>> +      if (ix86_flag_record_return)

>> +	fprintf (asm_out_file, "1:\n");

>> +      switch (ix86_instrument_return)

>> +	{

>> +	case instrument_return_call:

>> +	  fprintf (asm_out_file, "\tcall\t__return__\n");

>> +	  break;

>> +	case instrument_return_nop5:

>> +	  /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1)  */

>> +	  fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");

>> +	  break;

>> +	case instrument_return_none:

>> +	  break;

>> +	}

>> +

>> +      if (ix86_flag_record_return)

>> +	{

>> +	  fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n");

>> +	  fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");

>> +	  fprintf (asm_out_file, "\t.previous\n");

>> +	}

>> +    }

>> +}

>> +

>>  /* Output function return.  CALL_OP is the jump target.  Add a REP

>>     prefix to RET if LONG_P is true and function return is kept.  */

>>  

>>  const char *

>>  ix86_output_function_return (bool long_p)

>>  {

>> +  output_return_instrumentation ();

>> +

>>    if (cfun->machine->function_return_type != indirect_branch_keep)

>>      {

>>        char thunk_name[32];

>> @@ -28454,6 +28489,7 @@ ix86_output_call_insn (rtx_insn *insn, rtx call_op)

>>  

>>    if (SIBLING_CALL_P (insn))

>>      {

>> +      output_return_instrumentation ();

>>        if (direct_p)

>>  	{

>>  	  if (ix86_nopic_noplt_attribute_p (call_op))

>> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt

>> index e7fbf9b6f99..5925b75244f 100644

>> --- a/gcc/config/i386/i386.opt

>> +++ b/gcc/config/i386/i386.opt

>> @@ -1063,3 +1063,24 @@ Support WAITPKG built-in functions and code generation.

>>  mcldemote

>>  Target Report Mask(ISA_CLDEMOTE) Var(ix86_isa_flags2) Save

>>  Support CLDEMOTE built-in functions and code generation.

>> +

>> +minstrument-return=

>> +Target Report RejectNegative Joined Enum(instrument_return) Var(ix86_instrument_return) Init(instrument_return_none)

>> +Instrument function exit in instrumented functions with __fentry__.

>> +

>> +Enum

>> +Name(instrument_return) Type(enum instrument_return)

>> +Known choices for return instrumentation with -minstrument-return=

>> +

>> +EnumValue

>> +Enum(instrument_return) String(none) Value(instrument_return_none)

>> +

>> +EnumValue

>> +Enum(instrument_return) String(call) Value(instrument_return_call)

>> +

>> +EnumValue

>> +Enum(instrument_return) String(nop5) Value(instrument_return_nop5)

>> +

>> +mrecord-return

>> +Target Report Var(ix86_flag_record_return) Init(0)

>> +Generate a __return_loc section pointing to all return instrumentation code.

>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

>> index 1743c64582e..939be3e251b 100644

>> --- a/gcc/doc/invoke.texi

>> +++ b/gcc/doc/invoke.texi

>> @@ -1301,6 +1301,7 @@ See RS/6000 and PowerPC Options.

>>  -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol

>>  -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol

>>  -msse2avx  -mfentry  -mrecord-mcount  -mnop-mcount  -m8bit-idiv @gol

>> +-minstrument-return=@var{type} @gol

>>  -mavx256-split-unaligned-load  -mavx256-split-unaligned-store @gol

>>  -malign-data=@var{type}  -mstack-protector-guard=@var{guard} @gol

>>  -mstack-protector-guard-reg=@var{reg} @gol

>> @@ -28442,6 +28443,19 @@ the profiling functions as NOPs. This is useful when they

>>  should be patched in later dynamically. This is likely only

>>  useful together with @option{-mrecord-mcount}.

>>  

>> +@item -minstrument-return=@var{type}

>> +@opindex minstrument-return

>> +Instrument function exit in -pg -mfentry instrumented functions with

>> +call to specified function. This only instruments true returns ending

>> +with ret, but not sibling calls ending with jump. Valid types

>> +are @var{none} to not instrument, @var{call} to generate a call to __return__,

>> +or @var{nop5} to generate a 5 byte nop.

>> +

>> +@item -mrecord-return

>> +@itemx -mno-record-return

>> +@opindex mrecord-return

>> +Generate a __return_loc section pointing to all return instrumentation code.

>> +

>>  @item -mskip-rax-setup

>>  @itemx -mno-skip-rax-setup

>>  @opindex mskip-rax-setup

>> diff --git a/gcc/testsuite/gcc.target/i386/returninst1.c b/gcc/testsuite/gcc.target/i386/returninst1.c

>> new file mode 100644

>> index 00000000000..f970e75a774

>> --- /dev/null

>> +++ b/gcc/testsuite/gcc.target/i386/returninst1.c

>> @@ -0,0 +1,14 @@

>> +/* { dg-do compile } */

>> +/* { dg-options "-pg -mfentry -minstrument-return=call -mrecord-return" } */

>> +/* { dg-final { scan-assembler "call.*__return__" } } */

>> +/* { dg-final { scan-assembler "section.*return_loc" } } */

>> +

>> +int func(int a)

>> +{

>> +  return a+1;

>> +}

>> +

>> +int func2(int a)

>> +{

>> +  return a+1;

>> +}

>> diff --git a/gcc/testsuite/gcc.target/i386/returninst2.c b/gcc/testsuite/gcc.target/i386/returninst2.c

>> new file mode 100644

>> index 00000000000..716b38556dd

>> --- /dev/null

>> +++ b/gcc/testsuite/gcc.target/i386/returninst2.c

>> @@ -0,0 +1,21 @@

>> +/* { dg-do compile } */

>> +/* { dg-options "-pg -mfentry -minstrument-return=nop5 -mrecord-return" } */

>> +/* { dg-final { scan-assembler-times "0x0f, 0x1f, 0x44, 0x00, 0x00" 3 } } */

>> +/* { dg-final { scan-assembler "section.*return_loc" } } */

>> +

>> +int func(int a)

>> +{

>> +  return a+1;

>> +}

>> +

>> +int func2(int a)

>> +{

>> +  return a+1;

>> +}

>> +

>> +extern void func4(int);

>> +

>> +int func3(int a)

>> +{

>> +  func4(a + 1);

>> +}

>> diff --git a/gcc/testsuite/gcc.target/i386/returninst3.c b/gcc/testsuite/gcc.target/i386/returninst3.c

>> new file mode 100644

>> index 00000000000..5bbc60e8bd4

>> --- /dev/null

>> +++ b/gcc/testsuite/gcc.target/i386/returninst3.c

>> @@ -0,0 +1,9 @@

>> +/* { dg-do compile } */

>> +/* { dg-options "-pg -mfentry -minstrument-return=call" } */

>> +/* { dg-final { scan-assembler-not "call.*__return__" } } */

>> +

>> +__attribute__((no_instrument_function))

>> +int func(int a)

>> +{

>> +  return a+1;

>> +}
Andi Kleen Nov. 21, 2018, 8:48 p.m. | #3
Andi Kleen <ak@linux.intel.com> writes:

Ping^3!

> Andi Kleen <ak@linux.intel.com> writes:

>

> Ping!^2

>

>> Andi Kleen <andi@firstfloor.org> writes:

>>

>> Ping!

>>

>>> From: Andi Kleen <ak@linux.intel.com>

>>>

>>> When instrumenting programs using __fentry__ it is often useful

>>> to instrument the function return too. Traditionally this

>>> has been done by patching the return address on the stack

>>> frame on entry. However this is fairly complicated (trace

>>> function has to emulate a stack) and also slow because

>>> it causes a branch misprediction on every return.

>>>

>>> Add an option to generate call or nop instrumentation for

>>> every return instead, including patch sections.

>>>

>>> This will increase the program size slightly, but can be a

>>> lot faster and simpler.

>>>

>>> This version only instruments true returns, not sibling

>>> calls or tail recursion. This matches the semantics of the

>>> original stack.

>>>

>>> gcc/:

>>>

>>> 2018-11-04  Andi Kleen  <ak@linux.intel.com>

>>>

>>> 	* config/i386/i386-opts.h (enum instrument_return): Add.

>>> 	* config/i386/i386.c (output_return_instrumentation): Add.

>>> 	(ix86_output_function_return): Call output_return_instrumentation.

>>> 	(ix86_output_call_insn): Call output_return_instrumentation.

>>> 	* config/i386/i386.opt: Add -minstrument-return=.

>>> 	* doc/invoke.texi (-minstrument-return): Document.

>>>

>>> gcc/testsuite/:

>>>

>>> 2018-11-04  Andi Kleen  <ak@linux.intel.com>

>>>

>>> 	* gcc.target/i386/returninst1.c: New test.

>>> 	* gcc.target/i386/returninst2.c: New test.

>>> 	* gcc.target/i386/returninst3.c: New test.

>>> ---

>>>  gcc/config/i386/i386-opts.h                 |  6 ++++

>>>  gcc/config/i386/i386.c                      | 36 +++++++++++++++++++++

>>>  gcc/config/i386/i386.opt                    | 21 ++++++++++++

>>>  gcc/doc/invoke.texi                         | 14 ++++++++

>>>  gcc/testsuite/gcc.target/i386/returninst1.c | 14 ++++++++

>>>  gcc/testsuite/gcc.target/i386/returninst2.c | 21 ++++++++++++

>>>  gcc/testsuite/gcc.target/i386/returninst3.c |  9 ++++++

>>>  7 files changed, 121 insertions(+)

>>>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst1.c

>>>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst2.c

>>>  create mode 100644 gcc/testsuite/gcc.target/i386/returninst3.c

>>>

>>> diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h

>>> index 46366cbfa72..35e9413100e 100644

>>> --- a/gcc/config/i386/i386-opts.h

>>> +++ b/gcc/config/i386/i386-opts.h

>>> @@ -119,4 +119,10 @@ enum indirect_branch {

>>>    indirect_branch_thunk_extern

>>>  };

>>>  

>>> +enum instrument_return {

>>> +  instrument_return_none = 0,

>>> +  instrument_return_call,

>>> +  instrument_return_nop5

>>> +};

>>> +

>>>  #endif

>>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

>>> index f9ef0b4445b..f7cd94a8139 100644

>>> --- a/gcc/config/i386/i386.c

>>> +++ b/gcc/config/i386/i386.c

>>> @@ -28336,12 +28336,47 @@ ix86_output_indirect_jmp (rtx call_op)

>>>      return "%!jmp\t%A0";

>>>  }

>>>  

>>> +/* Output return instrumentation for current function if needed.  */

>>> +

>>> +static void

>>> +output_return_instrumentation (void)

>>> +{

>>> +  if (ix86_instrument_return != instrument_return_none

>>> +      && flag_fentry

>>> +      && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl))

>>> +    {

>>> +      if (ix86_flag_record_return)

>>> +	fprintf (asm_out_file, "1:\n");

>>> +      switch (ix86_instrument_return)

>>> +	{

>>> +	case instrument_return_call:

>>> +	  fprintf (asm_out_file, "\tcall\t__return__\n");

>>> +	  break;

>>> +	case instrument_return_nop5:

>>> +	  /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1)  */

>>> +	  fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");

>>> +	  break;

>>> +	case instrument_return_none:

>>> +	  break;

>>> +	}

>>> +

>>> +      if (ix86_flag_record_return)

>>> +	{

>>> +	  fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n");

>>> +	  fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");

>>> +	  fprintf (asm_out_file, "\t.previous\n");

>>> +	}

>>> +    }

>>> +}

>>> +

>>>  /* Output function return.  CALL_OP is the jump target.  Add a REP

>>>     prefix to RET if LONG_P is true and function return is kept.  */

>>>  

>>>  const char *

>>>  ix86_output_function_return (bool long_p)

>>>  {

>>> +  output_return_instrumentation ();

>>> +

>>>    if (cfun->machine->function_return_type != indirect_branch_keep)

>>>      {

>>>        char thunk_name[32];

>>> @@ -28454,6 +28489,7 @@ ix86_output_call_insn (rtx_insn *insn, rtx call_op)

>>>  

>>>    if (SIBLING_CALL_P (insn))

>>>      {

>>> +      output_return_instrumentation ();

>>>        if (direct_p)

>>>  	{

>>>  	  if (ix86_nopic_noplt_attribute_p (call_op))

>>> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt

>>> index e7fbf9b6f99..5925b75244f 100644

>>> --- a/gcc/config/i386/i386.opt

>>> +++ b/gcc/config/i386/i386.opt

>>> @@ -1063,3 +1063,24 @@ Support WAITPKG built-in functions and code generation.

>>>  mcldemote

>>>  Target Report Mask(ISA_CLDEMOTE) Var(ix86_isa_flags2) Save

>>>  Support CLDEMOTE built-in functions and code generation.

>>> +

>>> +minstrument-return=

>>> +Target Report RejectNegative Joined Enum(instrument_return) Var(ix86_instrument_return) Init(instrument_return_none)

>>> +Instrument function exit in instrumented functions with __fentry__.

>>> +

>>> +Enum

>>> +Name(instrument_return) Type(enum instrument_return)

>>> +Known choices for return instrumentation with -minstrument-return=

>>> +

>>> +EnumValue

>>> +Enum(instrument_return) String(none) Value(instrument_return_none)

>>> +

>>> +EnumValue

>>> +Enum(instrument_return) String(call) Value(instrument_return_call)

>>> +

>>> +EnumValue

>>> +Enum(instrument_return) String(nop5) Value(instrument_return_nop5)

>>> +

>>> +mrecord-return

>>> +Target Report Var(ix86_flag_record_return) Init(0)

>>> +Generate a __return_loc section pointing to all return instrumentation code.

>>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

>>> index 1743c64582e..939be3e251b 100644

>>> --- a/gcc/doc/invoke.texi

>>> +++ b/gcc/doc/invoke.texi

>>> @@ -1301,6 +1301,7 @@ See RS/6000 and PowerPC Options.

>>>  -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol

>>>  -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol

>>>  -msse2avx  -mfentry  -mrecord-mcount  -mnop-mcount  -m8bit-idiv @gol

>>> +-minstrument-return=@var{type} @gol

>>>  -mavx256-split-unaligned-load  -mavx256-split-unaligned-store @gol

>>>  -malign-data=@var{type}  -mstack-protector-guard=@var{guard} @gol

>>>  -mstack-protector-guard-reg=@var{reg} @gol

>>> @@ -28442,6 +28443,19 @@ the profiling functions as NOPs. This is useful when they

>>>  should be patched in later dynamically. This is likely only

>>>  useful together with @option{-mrecord-mcount}.

>>>  

>>> +@item -minstrument-return=@var{type}

>>> +@opindex minstrument-return

>>> +Instrument function exit in -pg -mfentry instrumented functions with

>>> +call to specified function. This only instruments true returns ending

>>> +with ret, but not sibling calls ending with jump. Valid types

>>> +are @var{none} to not instrument, @var{call} to generate a call to __return__,

>>> +or @var{nop5} to generate a 5 byte nop.

>>> +

>>> +@item -mrecord-return

>>> +@itemx -mno-record-return

>>> +@opindex mrecord-return

>>> +Generate a __return_loc section pointing to all return instrumentation code.

>>> +

>>>  @item -mskip-rax-setup

>>>  @itemx -mno-skip-rax-setup

>>>  @opindex mskip-rax-setup

>>> diff --git a/gcc/testsuite/gcc.target/i386/returninst1.c b/gcc/testsuite/gcc.target/i386/returninst1.c

>>> new file mode 100644

>>> index 00000000000..f970e75a774

>>> --- /dev/null

>>> +++ b/gcc/testsuite/gcc.target/i386/returninst1.c

>>> @@ -0,0 +1,14 @@

>>> +/* { dg-do compile } */

>>> +/* { dg-options "-pg -mfentry -minstrument-return=call -mrecord-return" } */

>>> +/* { dg-final { scan-assembler "call.*__return__" } } */

>>> +/* { dg-final { scan-assembler "section.*return_loc" } } */

>>> +

>>> +int func(int a)

>>> +{

>>> +  return a+1;

>>> +}

>>> +

>>> +int func2(int a)

>>> +{

>>> +  return a+1;

>>> +}

>>> diff --git a/gcc/testsuite/gcc.target/i386/returninst2.c b/gcc/testsuite/gcc.target/i386/returninst2.c

>>> new file mode 100644

>>> index 00000000000..716b38556dd

>>> --- /dev/null

>>> +++ b/gcc/testsuite/gcc.target/i386/returninst2.c

>>> @@ -0,0 +1,21 @@

>>> +/* { dg-do compile } */

>>> +/* { dg-options "-pg -mfentry -minstrument-return=nop5 -mrecord-return" } */

>>> +/* { dg-final { scan-assembler-times "0x0f, 0x1f, 0x44, 0x00, 0x00" 3 } } */

>>> +/* { dg-final { scan-assembler "section.*return_loc" } } */

>>> +

>>> +int func(int a)

>>> +{

>>> +  return a+1;

>>> +}

>>> +

>>> +int func2(int a)

>>> +{

>>> +  return a+1;

>>> +}

>>> +

>>> +extern void func4(int);

>>> +

>>> +int func3(int a)

>>> +{

>>> +  func4(a + 1);

>>> +}

>>> diff --git a/gcc/testsuite/gcc.target/i386/returninst3.c b/gcc/testsuite/gcc.target/i386/returninst3.c

>>> new file mode 100644

>>> index 00000000000..5bbc60e8bd4

>>> --- /dev/null

>>> +++ b/gcc/testsuite/gcc.target/i386/returninst3.c

>>> @@ -0,0 +1,9 @@

>>> +/* { dg-do compile } */

>>> +/* { dg-options "-pg -mfentry -minstrument-return=call" } */

>>> +/* { dg-final { scan-assembler-not "call.*__return__" } } */

>>> +

>>> +__attribute__((no_instrument_function))

>>> +int func(int a)

>>> +{

>>> +  return a+1;

>>> +}
Jan Hubicka Nov. 28, 2018, 3:59 p.m. | #4
Hi,
I wonder how __return_loc sections work.  In general it would be nice to
avoid direct output of assembler code in favour of adding instructions
to rtl stream which should be possible to insert the call and return.
__return_loc seems harder, also I wonder if it makes unwind information
possible?

Honza
Andi Kleen Nov. 28, 2018, 4:10 p.m. | #5
On Wed, Nov 28, 2018 at 04:59:25PM +0100, Jan Hubicka wrote:
> Hi,

> I wonder how __return_loc sections work.  In general it would be nice to


You mean how it's used? It's for patching in/out return instrumentation
at runtime, and to find them we use the sections.

> avoid direct output of assembler code in favour of adding instructions

> to rtl stream which should be possible to insert the call and return.


That would be a full rewrite how all these hooks work. I was 
hoping extending the existing frame work would be good enough for now.

> __return_loc seems harder, also I wonder if it makes unwind information

> possible?


Do we need unwind information for calls? I thought it was only
for other stack manipulation.

-Andi
Jan Hubicka Nov. 28, 2018, 4:24 p.m. | #6
> On Wed, Nov 28, 2018 at 04:59:25PM +0100, Jan Hubicka wrote:

> > Hi,

> > I wonder how __return_loc sections work.  In general it would be nice to

> 

> You mean how it's used? It's for patching in/out return instrumentation

> at runtime, and to find them we use the sections.

> 

> > avoid direct output of assembler code in favour of adding instructions

> > to rtl stream which should be possible to insert the call and return.

> 

> That would be a full rewrite how all these hooks work. I was 

> hoping extending the existing frame work would be good enough for now.


Yep, it is ugly and it hits us ocassionaly as with the CET
instrumentation, but I suppose we should care about this in stage1.
> 

> > __return_loc seems harder, also I wonder if it makes unwind information

> > possible?

> 

> Do we need unwind information for calls? I thought it was only

> for other stack manipulation.


I just wonder if backtraces are right if something triggers in the extra
code.  If you make control flow to jump into separate section, I suppose
backtraces won't be right.  Not sure if that is necessarily issue?

Honza
> 

> -Andi
Andi Kleen Nov. 28, 2018, 8:47 p.m. | #7
On Wed, Nov 28, 2018 at 05:24:37PM +0100, Jan Hubicka wrote:
> > On Wed, Nov 28, 2018 at 04:59:25PM +0100, Jan Hubicka wrote:

> > > Hi,

> > > I wonder how __return_loc sections work.  In general it would be nice to

> > 

> > You mean how it's used? It's for patching in/out return instrumentation

> > at runtime, and to find them we use the sections.

> > 

> > > avoid direct output of assembler code in favour of adding instructions

> > > to rtl stream which should be possible to insert the call and return.

> > 

> > That would be a full rewrite how all these hooks work. I was 

> > hoping extending the existing frame work would be good enough for now.

> 

> Yep, it is ugly and it hits us ocassionaly as with the CET

> instrumentation, but I suppose we should care about this in stage1.

> > 

> > > __return_loc seems harder, also I wonder if it makes unwind information

> > > possible?

> > 

> > Do we need unwind information for calls? I thought it was only

> > for other stack manipulation.

> 

> I just wonder if backtraces are right if something triggers in the extra

> code.  If you make control flow to jump into separate section, I suppose

> backtraces won't be right.  Not sure if that is necessarily issue?


I don't think it is because the instrumentation only adds calls, and
calls don't get annotated in DWARF. The only issue I could think of
if is something patches in push instructions through the nops, 
but there is really nothing the compiler could do about that.

I tested gdb and it can backtrace through the return instrumentation.

Breakpoint 1, 0x0000000000401182 in __return__ ()
(gdb) bt
#0  0x0000000000401182 in __return__ ()
#1  0x00000000004011a3 in f2 ()
#2  0x00000000004011b7 in main ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)

-Andi
Jan Hubicka Nov. 29, 2018, 10:30 p.m. | #8
> 

> I don't think it is because the instrumentation only adds calls, and

> calls don't get annotated in DWARF. The only issue I could think of

> if is something patches in push instructions through the nops, 

> but there is really nothing the compiler could do about that.

> 

> I tested gdb and it can backtrace through the return instrumentation.

> 

> Breakpoint 1, 0x0000000000401182 in __return__ ()

> (gdb) bt

> #0  0x0000000000401182 in __return__ ()

> #1  0x00000000004011a3 in f2 ()

> #2  0x00000000004011b7 in main ()

> Backtrace stopped: previous frame inner to this frame (corrupt stack?)


:) Not most user friendly message though.
The patch is OK.

Honza
> 

> -Andi

Patch

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index 46366cbfa72..35e9413100e 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -119,4 +119,10 @@  enum indirect_branch {
   indirect_branch_thunk_extern
 };
 
+enum instrument_return {
+  instrument_return_none = 0,
+  instrument_return_call,
+  instrument_return_nop5
+};
+
 #endif
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f9ef0b4445b..f7cd94a8139 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -28336,12 +28336,47 @@  ix86_output_indirect_jmp (rtx call_op)
     return "%!jmp\t%A0";
 }
 
+/* Output return instrumentation for current function if needed.  */
+
+static void
+output_return_instrumentation (void)
+{
+  if (ix86_instrument_return != instrument_return_none
+      && flag_fentry
+      && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl))
+    {
+      if (ix86_flag_record_return)
+	fprintf (asm_out_file, "1:\n");
+      switch (ix86_instrument_return)
+	{
+	case instrument_return_call:
+	  fprintf (asm_out_file, "\tcall\t__return__\n");
+	  break;
+	case instrument_return_nop5:
+	  /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1)  */
+	  fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+	  break;
+	case instrument_return_none:
+	  break;
+	}
+
+      if (ix86_flag_record_return)
+	{
+	  fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n");
+	  fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
+	  fprintf (asm_out_file, "\t.previous\n");
+	}
+    }
+}
+
 /* Output function return.  CALL_OP is the jump target.  Add a REP
    prefix to RET if LONG_P is true and function return is kept.  */
 
 const char *
 ix86_output_function_return (bool long_p)
 {
+  output_return_instrumentation ();
+
   if (cfun->machine->function_return_type != indirect_branch_keep)
     {
       char thunk_name[32];
@@ -28454,6 +28489,7 @@  ix86_output_call_insn (rtx_insn *insn, rtx call_op)
 
   if (SIBLING_CALL_P (insn))
     {
+      output_return_instrumentation ();
       if (direct_p)
 	{
 	  if (ix86_nopic_noplt_attribute_p (call_op))
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index e7fbf9b6f99..5925b75244f 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1063,3 +1063,24 @@  Support WAITPKG built-in functions and code generation.
 mcldemote
 Target Report Mask(ISA_CLDEMOTE) Var(ix86_isa_flags2) Save
 Support CLDEMOTE built-in functions and code generation.
+
+minstrument-return=
+Target Report RejectNegative Joined Enum(instrument_return) Var(ix86_instrument_return) Init(instrument_return_none)
+Instrument function exit in instrumented functions with __fentry__.
+
+Enum
+Name(instrument_return) Type(enum instrument_return)
+Known choices for return instrumentation with -minstrument-return=
+
+EnumValue
+Enum(instrument_return) String(none) Value(instrument_return_none)
+
+EnumValue
+Enum(instrument_return) String(call) Value(instrument_return_call)
+
+EnumValue
+Enum(instrument_return) String(nop5) Value(instrument_return_nop5)
+
+mrecord-return
+Target Report Var(ix86_flag_record_return) Init(0)
+Generate a __return_loc section pointing to all return instrumentation code.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1743c64582e..939be3e251b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1301,6 +1301,7 @@  See RS/6000 and PowerPC Options.
 -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol
 -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol
 -msse2avx  -mfentry  -mrecord-mcount  -mnop-mcount  -m8bit-idiv @gol
+-minstrument-return=@var{type} @gol
 -mavx256-split-unaligned-load  -mavx256-split-unaligned-store @gol
 -malign-data=@var{type}  -mstack-protector-guard=@var{guard} @gol
 -mstack-protector-guard-reg=@var{reg} @gol
@@ -28442,6 +28443,19 @@  the profiling functions as NOPs. This is useful when they
 should be patched in later dynamically. This is likely only
 useful together with @option{-mrecord-mcount}.
 
+@item -minstrument-return=@var{type}
+@opindex minstrument-return
+Instrument function exit in -pg -mfentry instrumented functions with
+call to specified function. This only instruments true returns ending
+with ret, but not sibling calls ending with jump. Valid types
+are @var{none} to not instrument, @var{call} to generate a call to __return__,
+or @var{nop5} to generate a 5 byte nop.
+
+@item -mrecord-return
+@itemx -mno-record-return
+@opindex mrecord-return
+Generate a __return_loc section pointing to all return instrumentation code.
+
 @item -mskip-rax-setup
 @itemx -mno-skip-rax-setup
 @opindex mskip-rax-setup
diff --git a/gcc/testsuite/gcc.target/i386/returninst1.c b/gcc/testsuite/gcc.target/i386/returninst1.c
new file mode 100644
index 00000000000..f970e75a774
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/returninst1.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-pg -mfentry -minstrument-return=call -mrecord-return" } */
+/* { dg-final { scan-assembler "call.*__return__" } } */
+/* { dg-final { scan-assembler "section.*return_loc" } } */
+
+int func(int a)
+{
+  return a+1;
+}
+
+int func2(int a)
+{
+  return a+1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/returninst2.c b/gcc/testsuite/gcc.target/i386/returninst2.c
new file mode 100644
index 00000000000..716b38556dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/returninst2.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-pg -mfentry -minstrument-return=nop5 -mrecord-return" } */
+/* { dg-final { scan-assembler-times "0x0f, 0x1f, 0x44, 0x00, 0x00" 3 } } */
+/* { dg-final { scan-assembler "section.*return_loc" } } */
+
+int func(int a)
+{
+  return a+1;
+}
+
+int func2(int a)
+{
+  return a+1;
+}
+
+extern void func4(int);
+
+int func3(int a)
+{
+  func4(a + 1);
+}
diff --git a/gcc/testsuite/gcc.target/i386/returninst3.c b/gcc/testsuite/gcc.target/i386/returninst3.c
new file mode 100644
index 00000000000..5bbc60e8bd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/returninst3.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-pg -mfentry -minstrument-return=call" } */
+/* { dg-final { scan-assembler-not "call.*__return__" } } */
+
+__attribute__((no_instrument_function))
+int func(int a)
+{
+  return a+1;
+}