[6/6,ARC] Reimplement return padding operation for ARC700.

Message ID 20180521101839.13288-6-claziss@gmail.com
State New
Headers show
Series
  • [1/6,ARC] Add modes scheduler HS
Related show

Commit Message

Claudiu Zissulescu May 21, 2018, 10:18 a.m.
From: Claudiu Zissulescu <claziss@synopsys.com>


For ARC700, adding padding if necessary to avoid a mispredict.  A
return could happen immediately after the function start.  A
call/return and return/return must be 6 bytes apart to avoid
mispredict.

The old implementation was doing this operation very late in the
compilation process, and the additional nop instructions and/or
forcing some other instruction to take their long form was not taken
into account when generating brcc instructions. Thus, wrong code could
be generated.

Ok to apply?
Claudiu

gcc/
2017-03-24  Claudiu Zissulescu  <claziss@synopsys.com>

	* config/arc/arc-protos.h (arc_pad_return): Remove.
	* config/arc/arc.c (machine_function): Remove force_short_suffix
	and size_reason.
	(arc_print_operand): Adjust printing of '&'.
	(arc_verify_short): Remove conditional printing of short suffix.
	(arc_final_prescan_insn): Remove reference to size_reason.
	(pad_return): New function.
	(arc_reorg): Call pad_return.
	(arc_pad_return): Remove.
	(arc_init_machine_status): Remove reference to force_short_suffix.
	* config/arc/arc.md (vunspec): Add VUNSPEC_ARC_BLOCKAGE.
	(attr length): When attribute iscompact is true force to 2
	regardless; in the case of maybe check if we want to force the
	instruction to have 4 bytes length.
	(nopv): Change it to generate 4 byte long nop as well.
	(blockage): New pattern.
	(simple_return): Remove call to arc_pad_return.
	(p_return_i): Likewise.

gcc/testsuite/
2017-03-24  Claudiu Zissulescu  <claziss@synopsys.com>

	* gcc.target/arc/pr9001107555.c: New file.
---
 gcc/config/arc/arc-protos.h                 |   1 -
 gcc/config/arc/arc.c                        | 156 +++++++++++++---------------
 gcc/config/arc/arc.md                       |  26 +++--
 gcc/testsuite/gcc.target/arc/pr9001107555.c |  38 +++++++
 4 files changed, 128 insertions(+), 93 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arc/pr9001107555.c

-- 
2.14.3

Comments

Andrew Burgess June 12, 2018, 8:10 p.m. | #1
* Claudiu Zissulescu <claziss@gmail.com> [2018-05-21 13:18:39 +0300]:

> From: Claudiu Zissulescu <claziss@synopsys.com>

> 

> For ARC700, adding padding if necessary to avoid a mispredict.  A

> return could happen immediately after the function start.  A

> call/return and return/return must be 6 bytes apart to avoid

> mispredict.

> 

> The old implementation was doing this operation very late in the

> compilation process, and the additional nop instructions and/or

> forcing some other instruction to take their long form was not taken

> into account when generating brcc instructions. Thus, wrong code could

> be generated.

> 

> Ok to apply?


This looks fine with a couple of small adjustments, inline below...

> Claudiu

> 

> gcc/

> 2017-03-24  Claudiu Zissulescu  <claziss@synopsys.com>

> 

> 	* config/arc/arc-protos.h (arc_pad_return): Remove.

> 	* config/arc/arc.c (machine_function): Remove force_short_suffix

> 	and size_reason.

> 	(arc_print_operand): Adjust printing of '&'.

> 	(arc_verify_short): Remove conditional printing of short suffix.

> 	(arc_final_prescan_insn): Remove reference to size_reason.

> 	(pad_return): New function.

> 	(arc_reorg): Call pad_return.

> 	(arc_pad_return): Remove.

> 	(arc_init_machine_status): Remove reference to force_short_suffix.

> 	* config/arc/arc.md (vunspec): Add VUNSPEC_ARC_BLOCKAGE.

> 	(attr length): When attribute iscompact is true force to 2

> 	regardless; in the case of maybe check if we want to force the

> 	instruction to have 4 bytes length.

> 	(nopv): Change it to generate 4 byte long nop as well.

> 	(blockage): New pattern.

> 	(simple_return): Remove call to arc_pad_return.

> 	(p_return_i): Likewise.

> 

> gcc/testsuite/

> 2017-03-24  Claudiu Zissulescu  <claziss@synopsys.com>

> 

> 	* gcc.target/arc/pr9001107555.c: New file.

> ---

>  gcc/config/arc/arc-protos.h                 |   1 -

>  gcc/config/arc/arc.c                        | 156 +++++++++++++---------------

>  gcc/config/arc/arc.md                       |  26 +++--

>  gcc/testsuite/gcc.target/arc/pr9001107555.c |  38 +++++++

>  4 files changed, 128 insertions(+), 93 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/arc/pr9001107555.c

> 

> diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h

> index 67f3b4e3226..ce4b6f84749 100644

> --- a/gcc/config/arc/arc-protos.h

> +++ b/gcc/config/arc/arc-protos.h

> @@ -89,7 +89,6 @@ extern void arc_clear_unalign (void);

>  extern void arc_toggle_unalign (void);

>  extern void split_addsi (rtx *);

>  extern void split_subsi (rtx *);

> -extern void arc_pad_return (void);

>  extern void arc_split_move (rtx *);

>  extern const char *arc_short_long (rtx_insn *insn, const char *, const char *);

>  extern rtx arc_regno_use_in (unsigned int, rtx);

> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c

> index b1a09d82b72..22f1442a027 100644

> --- a/gcc/config/arc/arc.c

> +++ b/gcc/config/arc/arc.c

> @@ -2648,8 +2648,6 @@ typedef struct GTY (()) machine_function

>    struct arc_frame_info frame_info;

>    /* To keep track of unalignment caused by short insns.  */

>    int unalign;

> -  int force_short_suffix; /* Used when disgorging return delay slot insns.  */

> -  const char *size_reason;

>    struct arc_ccfsm ccfsm_current;

>    /* Map from uid to ccfsm state during branch shortening.  */

>    rtx ccfsm_current_insn;

> @@ -4307,7 +4305,7 @@ arc_print_operand (FILE *file, rtx x, int code)

>  	}

>        break;

>      case '&':

> -      if (TARGET_ANNOTATE_ALIGN && cfun->machine->size_reason)

> +      if (TARGET_ANNOTATE_ALIGN)

>  	fprintf (file, "; unalign: %d", cfun->machine->unalign);

>        return;

>      case '+':

> @@ -4980,7 +4978,6 @@ static int

>  arc_verify_short (rtx_insn *insn, int, int check_attr)

>  {

>    enum attr_iscompact iscompact;

> -  struct machine_function *machine;

>  

>    if (check_attr > 0)

>      {

> @@ -4988,10 +4985,6 @@ arc_verify_short (rtx_insn *insn, int, int check_attr)

>        if (iscompact == ISCOMPACT_FALSE)

>  	return 0;

>      }

> -  machine = cfun->machine;

> -

> -  if (machine->force_short_suffix >= 0)

> -    return machine->force_short_suffix;

>  

>    return (get_attr_length (insn) & 2) != 0;

>  }

> @@ -5030,8 +5023,6 @@ arc_final_prescan_insn (rtx_insn *insn, rtx *opvec ATTRIBUTE_UNUSED,

>        cfun->machine->prescan_initialized = 1;

>      }

>    arc_ccfsm_advance (insn, &arc_ccfsm_current);

> -

> -  cfun->machine->size_reason = 0;

>  }

>  

>  /* Given FROM and TO register numbers, say whether this elimination is allowed.

> @@ -7673,6 +7664,76 @@ jli_call_scan (void)

>      }

>  }

>  

> +/* Add padding if necessary to avoid a mispredict.  A return could

> +   happen immediately after the function start.  A call/return and

> +   return/return must be 6 bytes apart to avoid mispredict.  */

> +

> +static void

> +pad_return (void)

> +{

> +  rtx_insn *insn;

> +  long offset;

> +

> +  if (!TARGET_PAD_RETURN)

> +    return;

> +

> +  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))

> +    {

> +      rtx_insn *prev0 = prev_active_insn (insn);

> +      bool wantlong = false;

> +

> +      if (!INSN_P (insn) || GET_CODE (PATTERN (insn)) != SIMPLE_RETURN)

> +	continue;

> +

> +      if (!prev0)

> +	{

> +	  prev0 = emit_insn_before (gen_nopv (), insn);

> +	  /* REG_SAVE_NOTE is used by Haifa scheduler, we are in reorg

> +	     so it is safe to reuse it for forcing a particular length

> +	     for an instruction.  */

> +	  add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1));

> +	  emit_insn_before (gen_nopv (), insn);

> +	  continue;

> +	}

> +      offset = get_attr_length (prev0);

> +

> +      if (get_attr_length (prev0) == 2

> +	  && get_attr_iscompact (prev0) != ISCOMPACT_TRUE)

> +	{

> +	  /* Force long version of the insn.  */

> +	  wantlong = true;

> +	  offset += 2;

> +	}

> +

> +     rtx_insn *prev = prev_active_insn (prev0);


There's something funny with the indentation in this line.  Could you
just double check before committing.

> +      if (prev)

> +	offset += get_attr_length (prev);

> +

> +      prev = prev_active_insn (prev);

> +      if (prev)

> +	offset += get_attr_length (prev);

> +

> +      switch (offset)

> +	{

> +	case 2:

> +	  prev = emit_insn_before (gen_nopv (), insn);

> +	  add_reg_note (prev, REG_SAVE_NOTE, GEN_INT (1));

> +	  break;

> +	case 4:

> +	  emit_insn_before (gen_nopv (), insn);

> +	  break;

> +	default:

> +	  continue;

> +	}

> +

> +      if (wantlong)

> +	add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1));

> +

> +      /* Emit a blockage to avoid delay slot scheduling.  */

> +      emit_insn_before (gen_blockage(), insn);

> +    }

> +}

> +

>  static int arc_reorg_in_progress = 0;

>  

>  /* ARC's machince specific reorg function.  */

> @@ -7698,6 +7759,7 @@ arc_reorg (void)

>  

>    workaround_arc_anomaly ();

>    jli_call_scan ();

> +  pad_return ();

>  

>  /* FIXME: should anticipate ccfsm action, generate special patterns for

>     to-be-deleted branches that have no delay slot and have at least the

> @@ -9256,79 +9318,6 @@ arc_branch_size_unknown_p (void)

>    return !optimize_size && arc_reorg_in_progress;

>  }

>  

> -/* We are about to output a return insn.  Add padding if necessary to avoid

> -   a mispredict.  A return could happen immediately after the function

> -   start, but after a call we know that there will be at least a blink

> -   restore.  */

> -

> -void

> -arc_pad_return (void)

> -{

> -  rtx_insn *insn = current_output_insn;

> -  rtx_insn *prev = prev_active_insn (insn);

> -  int want_long;

> -

> -  if (!prev)

> -    {

> -      fputs ("\tnop_s\n", asm_out_file);

> -      cfun->machine->unalign ^= 2;

> -      want_long = 1;

> -    }

> -  /* If PREV is a sequence, we know it must be a branch / jump or a tailcall,

> -     because after a call, we'd have to restore blink first.  */

> -  else if (GET_CODE (PATTERN (prev)) == SEQUENCE)

> -    return;

> -  else

> -    {

> -      want_long = (get_attr_length (prev) == 2);

> -      prev = prev_active_insn (prev);

> -    }

> -  if (!prev

> -      || ((NONJUMP_INSN_P (prev) && GET_CODE (PATTERN (prev)) == SEQUENCE)

> -	  ? CALL_ATTR (as_a <rtx_sequence *> (PATTERN (prev))->insn (0),

> -		       NON_SIBCALL)

> -	  : CALL_ATTR (prev, NON_SIBCALL)))

> -    {

> -      if (want_long)

> -	cfun->machine->size_reason

> -	  = "call/return and return/return must be 6 bytes apart to avoid mispredict";

> -      else if (TARGET_UNALIGN_BRANCH && cfun->machine->unalign)

> -	{

> -	  cfun->machine->size_reason

> -	    = "Long unaligned jump avoids non-delay slot penalty";

> -	  want_long = 1;

> -	}

> -      /* Disgorge delay insn, if there is any, and it may be moved.  */

> -      if (final_sequence

> -	  /* ??? Annulled would be OK if we can and do conditionalize

> -	     the delay slot insn accordingly.  */

> -	  && !INSN_ANNULLED_BRANCH_P (insn)

> -	  && (get_attr_cond (insn) != COND_USE

> -	      || !reg_set_p (gen_rtx_REG (CCmode, CC_REG),

> -			     XVECEXP (final_sequence, 0, 1))))

> -	{

> -	  prev = as_a <rtx_insn *> (XVECEXP (final_sequence, 0, 1));

> -	  gcc_assert (!prev_real_insn (insn)

> -		      || !arc_hazard (prev_real_insn (insn), prev));

> -	  cfun->machine->force_short_suffix = !want_long;

> -	  rtx save_pred = current_insn_predicate;

> -	  final_scan_insn (prev, asm_out_file, optimize, 1, NULL);

> -	  cfun->machine->force_short_suffix = -1;

> -	  prev->set_deleted ();

> -	  current_output_insn = insn;

> -	  current_insn_predicate = save_pred;

> -	}

> -      else if (want_long)

> -	fputs ("\tnop\n", asm_out_file);

> -      else

> -	{

> -	  fputs ("\tnop_s\n", asm_out_file);

> -	  cfun->machine->unalign ^= 2;

> -	}

> -    }

> -  return;

> -}

> -

>  /* The usual; we set up our machine_function data.  */

>  

>  static struct machine_function *

> @@ -9337,7 +9326,6 @@ arc_init_machine_status (void)

>    struct machine_function *machine;

>    machine = ggc_cleared_alloc<machine_function> ();

>    machine->fn_type = ARC_FUNCTION_UNKNOWN;

> -  machine->force_short_suffix = -1;

>  

>    return machine;

>  }

> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md

> index 5610bab694c..2401926f08d 100644

> --- a/gcc/config/arc/arc.md

> +++ b/gcc/config/arc/arc.md

> @@ -162,6 +162,7 @@

>    VUNSPEC_ARC_CAS

>    VUNSPEC_ARC_SC

>    VUNSPEC_ARC_LL

> +  VUNSPEC_ARC_BLOCKAGE

>    ])

>  

>  (define_constants

> @@ -385,13 +386,18 @@

>  ;; and insn lengths: insns with shimm values cannot be conditionally executed.

>  (define_attr "length" ""

>    (cond

> -    [(eq_attr "iscompact" "true,maybe")

> +    [(eq_attr "iscompact" "true")

> +      (const_int 2)

> +

> +     (eq_attr "iscompact" "maybe")

>       (cond

>         [(eq_attr "type" "sfunc")

>  	(cond [(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC")

>  	       (const_int 12)]

>  	      (const_int 10))

> -	(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)]

> +	(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)

> +	(match_test "find_reg_note (insn, REG_SAVE_NOTE, GEN_INT (1))")

> +	(const_int 4)]

>        (const_int 2))

>  

>      (eq_attr "iscompact" "true_limm")

> @@ -4447,8 +4453,16 @@ archs4x, archs4xd, archs4xd_slow"

>    ""

>    "nop%?"

>    [(set_attr "type" "misc")

> -   (set_attr "iscompact" "true")

> -   (set_attr "length" "2")])

> +   (set_attr "iscompact" "maybe")

> +   (set_attr "length" "*")])

> +

> +(define_insn "blockage"

> +  [(unspec_volatile [(const_int 0)] VUNSPEC_ARC_BLOCKAGE)]

> +  ""

> +  ""

> +  [(set_attr "length" "0")

> +   (set_attr "type" "block")]

> +)

>  

>  ;; Split up troublesome insns for better scheduling.

>  

> @@ -4993,8 +5007,6 @@ archs4x, archs4xd, archs4xd_slow"

>    {

>      return \"rtie\";

>    }

> -  if (TARGET_PAD_RETURN)

> -    arc_pad_return ();

>    output_asm_insn (\"j%!%* [%0]%&\", &reg);

>    return \"\";

>  }

> @@ -5038,8 +5050,6 @@ archs4x, archs4xd, archs4xd_slow"

>  		   arc_return_address_register (arc_compute_function_type

>  						(cfun)));

>  

> -  if (TARGET_PAD_RETURN)

> -    arc_pad_return ();

>    output_asm_insn (\"j%d0%!%# [%1]%&\", xop);

>    /* record the condition in case there is a delay insn.  */

>    arc_ccfsm_record_condition (xop[0], false, insn, 0);

> diff --git a/gcc/testsuite/gcc.target/arc/pr9001107555.c b/gcc/testsuite/gcc.target/arc/pr9001107555.c

> new file mode 100644

> index 00000000000..134426d33d9

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arc/pr9001107555.c

> @@ -0,0 +1,38 @@

> +/* { dg-do assemble } *

> +/* { dg-skip-if "" { ! { clmcpu } } } */

> +/* { dg-options "-O3 -w -funroll-loops -mno-sdata -mcpu=arc700" } */

> +

> +typedef a __attribute__((__mode__(__DI__)));

> +typedef struct c c;

> +struct b {

> +  int d;

> +  c *e

> +};

> +enum { f };

> +typedef struct {

> +  a g;

> +  a h;

> +  int i

> +} j;

> +struct c {

> +  int count;

> +  int current

> +};

> +k;


I prefer tests to be up to date C, so 'int k;' here, _unless_ it turns
out that being old style C is critical for the test, in which case,
ideally we'd have a comment explaining that.

In general, if it's possible (without breaking the test) it would be
nice if we followed the GCC coding standard for test source, it just
makes things easier to read.

> +l(struct b *demux, __builtin_va_list args) {

> +  c m = *demux->e;

> +  j *n;

> +  switch (k)

> +  case f: {

> +    a o = __builtin_va_arg(args, a);

> +    m.current = 0;

> +    while (m.current < m.count) {

> +      if (n[m.current].h > o) {

> +        p(demux->d, 4 + 128LL * n[m.current].i);

> +        break;

> +      }

> +      m.current++;

> +    }

> +    return 0;

> +  }

> +}


Otherwise, this all looks fine.

Thanks,
Andrew
Claudiu Zissulescu June 13, 2018, 8:43 a.m. | #2
I've included and I updated the patch with your suggestions.
Thank you,
Claudiu 
________________________________________
From: Andrew Burgess [andrew.burgess@embecosm.com]

Sent: Tuesday, June 12, 2018 10:10 PM
To: Claudiu Zissulescu
Cc: gcc-patches@gcc.gnu.org; Francois.Bedard@synopsys.com; claziss@synopsys.comq; Claudiu Zissulescu
Subject: Re: [PATCH 6/6] [ARC] Reimplement return padding operation for ARC700.

* Claudiu Zissulescu <claziss@gmail.com> [2018-05-21 13:18:39 +0300]:

> From: Claudiu Zissulescu <claziss@synopsys.com>

>

> For ARC700, adding padding if necessary to avoid a mispredict.  A

> return could happen immediately after the function start.  A

> call/return and return/return must be 6 bytes apart to avoid

> mispredict.

>

> The old implementation was doing this operation very late in the

> compilation process, and the additional nop instructions and/or

> forcing some other instruction to take their long form was not taken

> into account when generating brcc instructions. Thus, wrong code could

> be generated.

>

> Ok to apply?


This looks fine with a couple of small adjustments, inline below...

> Claudiu

>

> gcc/

> 2017-03-24  Claudiu Zissulescu  <claziss@synopsys.com>

>

>       * config/arc/arc-protos.h (arc_pad_return): Remove.

>       * config/arc/arc.c (machine_function): Remove force_short_suffix

>       and size_reason.

>       (arc_print_operand): Adjust printing of '&'.

>       (arc_verify_short): Remove conditional printing of short suffix.

>       (arc_final_prescan_insn): Remove reference to size_reason.

>       (pad_return): New function.

>       (arc_reorg): Call pad_return.

>       (arc_pad_return): Remove.

>       (arc_init_machine_status): Remove reference to force_short_suffix.

>       * config/arc/arc.md (vunspec): Add VUNSPEC_ARC_BLOCKAGE.

>       (attr length): When attribute iscompact is true force to 2

>       regardless; in the case of maybe check if we want to force the

>       instruction to have 4 bytes length.

>       (nopv): Change it to generate 4 byte long nop as well.

>       (blockage): New pattern.

>       (simple_return): Remove call to arc_pad_return.

>       (p_return_i): Likewise.

>

> gcc/testsuite/

> 2017-03-24  Claudiu Zissulescu  <claziss@synopsys.com>

>

>       * gcc.target/arc/pr9001107555.c: New file.

> ---

>  gcc/config/arc/arc-protos.h                 |   1 -

>  gcc/config/arc/arc.c                        | 156 +++++++++++++---------------

>  gcc/config/arc/arc.md                       |  26 +++--

>  gcc/testsuite/gcc.target/arc/pr9001107555.c |  38 +++++++

>  4 files changed, 128 insertions(+), 93 deletions(-)

>  create mode 100644 gcc/testsuite/gcc.target/arc/pr9001107555.c

>

> diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h

> index 67f3b4e3226..ce4b6f84749 100644

> --- a/gcc/config/arc/arc-protos.h

> +++ b/gcc/config/arc/arc-protos.h

> @@ -89,7 +89,6 @@ extern void arc_clear_unalign (void);

>  extern void arc_toggle_unalign (void);

>  extern void split_addsi (rtx *);

>  extern void split_subsi (rtx *);

> -extern void arc_pad_return (void);

>  extern void arc_split_move (rtx *);

>  extern const char *arc_short_long (rtx_insn *insn, const char *, const char *);

>  extern rtx arc_regno_use_in (unsigned int, rtx);

> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c

> index b1a09d82b72..22f1442a027 100644

> --- a/gcc/config/arc/arc.c

> +++ b/gcc/config/arc/arc.c

> @@ -2648,8 +2648,6 @@ typedef struct GTY (()) machine_function

>    struct arc_frame_info frame_info;

>    /* To keep track of unalignment caused by short insns.  */

>    int unalign;

> -  int force_short_suffix; /* Used when disgorging return delay slot insns.  */

> -  const char *size_reason;

>    struct arc_ccfsm ccfsm_current;

>    /* Map from uid to ccfsm state during branch shortening.  */

>    rtx ccfsm_current_insn;

> @@ -4307,7 +4305,7 @@ arc_print_operand (FILE *file, rtx x, int code)

>       }

>        break;

>      case '&':

> -      if (TARGET_ANNOTATE_ALIGN && cfun->machine->size_reason)

> +      if (TARGET_ANNOTATE_ALIGN)

>       fprintf (file, "; unalign: %d", cfun->machine->unalign);

>        return;

>      case '+':

> @@ -4980,7 +4978,6 @@ static int

>  arc_verify_short (rtx_insn *insn, int, int check_attr)

>  {

>    enum attr_iscompact iscompact;

> -  struct machine_function *machine;

>

>    if (check_attr > 0)

>      {

> @@ -4988,10 +4985,6 @@ arc_verify_short (rtx_insn *insn, int, int check_attr)

>        if (iscompact == ISCOMPACT_FALSE)

>       return 0;

>      }

> -  machine = cfun->machine;

> -

> -  if (machine->force_short_suffix >= 0)

> -    return machine->force_short_suffix;

>

>    return (get_attr_length (insn) & 2) != 0;

>  }

> @@ -5030,8 +5023,6 @@ arc_final_prescan_insn (rtx_insn *insn, rtx *opvec ATTRIBUTE_UNUSED,

>        cfun->machine->prescan_initialized = 1;

>      }

>    arc_ccfsm_advance (insn, &arc_ccfsm_current);

> -

> -  cfun->machine->size_reason = 0;

>  }

>

>  /* Given FROM and TO register numbers, say whether this elimination is allowed.

> @@ -7673,6 +7664,76 @@ jli_call_scan (void)

>      }

>  }

>

> +/* Add padding if necessary to avoid a mispredict.  A return could

> +   happen immediately after the function start.  A call/return and

> +   return/return must be 6 bytes apart to avoid mispredict.  */

> +

> +static void

> +pad_return (void)

> +{

> +  rtx_insn *insn;

> +  long offset;

> +

> +  if (!TARGET_PAD_RETURN)

> +    return;

> +

> +  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))

> +    {

> +      rtx_insn *prev0 = prev_active_insn (insn);

> +      bool wantlong = false;

> +

> +      if (!INSN_P (insn) || GET_CODE (PATTERN (insn)) != SIMPLE_RETURN)

> +     continue;

> +

> +      if (!prev0)

> +     {

> +       prev0 = emit_insn_before (gen_nopv (), insn);

> +       /* REG_SAVE_NOTE is used by Haifa scheduler, we are in reorg

> +          so it is safe to reuse it for forcing a particular length

> +          for an instruction.  */

> +       add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1));

> +       emit_insn_before (gen_nopv (), insn);

> +       continue;

> +     }

> +      offset = get_attr_length (prev0);

> +

> +      if (get_attr_length (prev0) == 2

> +       && get_attr_iscompact (prev0) != ISCOMPACT_TRUE)

> +     {

> +       /* Force long version of the insn.  */

> +       wantlong = true;

> +       offset += 2;

> +     }

> +

> +     rtx_insn *prev = prev_active_insn (prev0);


There's something funny with the indentation in this line.  Could you
just double check before committing.

> +      if (prev)

> +     offset += get_attr_length (prev);

> +

> +      prev = prev_active_insn (prev);

> +      if (prev)

> +     offset += get_attr_length (prev);

> +

> +      switch (offset)

> +     {

> +     case 2:

> +       prev = emit_insn_before (gen_nopv (), insn);

> +       add_reg_note (prev, REG_SAVE_NOTE, GEN_INT (1));

> +       break;

> +     case 4:

> +       emit_insn_before (gen_nopv (), insn);

> +       break;

> +     default:

> +       continue;

> +     }

> +

> +      if (wantlong)

> +     add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1));

> +

> +      /* Emit a blockage to avoid delay slot scheduling.  */

> +      emit_insn_before (gen_blockage(), insn);

> +    }

> +}

> +

>  static int arc_reorg_in_progress = 0;

>

>  /* ARC's machince specific reorg function.  */

> @@ -7698,6 +7759,7 @@ arc_reorg (void)

>

>    workaround_arc_anomaly ();

>    jli_call_scan ();

> +  pad_return ();

>

>  /* FIXME: should anticipate ccfsm action, generate special patterns for

>     to-be-deleted branches that have no delay slot and have at least the

> @@ -9256,79 +9318,6 @@ arc_branch_size_unknown_p (void)

>    return !optimize_size && arc_reorg_in_progress;

>  }

>

> -/* We are about to output a return insn.  Add padding if necessary to avoid

> -   a mispredict.  A return could happen immediately after the function

> -   start, but after a call we know that there will be at least a blink

> -   restore.  */

> -

> -void

> -arc_pad_return (void)

> -{

> -  rtx_insn *insn = current_output_insn;

> -  rtx_insn *prev = prev_active_insn (insn);

> -  int want_long;

> -

> -  if (!prev)

> -    {

> -      fputs ("\tnop_s\n", asm_out_file);

> -      cfun->machine->unalign ^= 2;

> -      want_long = 1;

> -    }

> -  /* If PREV is a sequence, we know it must be a branch / jump or a tailcall,

> -     because after a call, we'd have to restore blink first.  */

> -  else if (GET_CODE (PATTERN (prev)) == SEQUENCE)

> -    return;

> -  else

> -    {

> -      want_long = (get_attr_length (prev) == 2);

> -      prev = prev_active_insn (prev);

> -    }

> -  if (!prev

> -      || ((NONJUMP_INSN_P (prev) && GET_CODE (PATTERN (prev)) == SEQUENCE)

> -       ? CALL_ATTR (as_a <rtx_sequence *> (PATTERN (prev))->insn (0),

> -                    NON_SIBCALL)

> -       : CALL_ATTR (prev, NON_SIBCALL)))

> -    {

> -      if (want_long)

> -     cfun->machine->size_reason

> -       = "call/return and return/return must be 6 bytes apart to avoid mispredict";

> -      else if (TARGET_UNALIGN_BRANCH && cfun->machine->unalign)

> -     {

> -       cfun->machine->size_reason

> -         = "Long unaligned jump avoids non-delay slot penalty";

> -       want_long = 1;

> -     }

> -      /* Disgorge delay insn, if there is any, and it may be moved.  */

> -      if (final_sequence

> -       /* ??? Annulled would be OK if we can and do conditionalize

> -          the delay slot insn accordingly.  */

> -       && !INSN_ANNULLED_BRANCH_P (insn)

> -       && (get_attr_cond (insn) != COND_USE

> -           || !reg_set_p (gen_rtx_REG (CCmode, CC_REG),

> -                          XVECEXP (final_sequence, 0, 1))))

> -     {

> -       prev = as_a <rtx_insn *> (XVECEXP (final_sequence, 0, 1));

> -       gcc_assert (!prev_real_insn (insn)

> -                   || !arc_hazard (prev_real_insn (insn), prev));

> -       cfun->machine->force_short_suffix = !want_long;

> -       rtx save_pred = current_insn_predicate;

> -       final_scan_insn (prev, asm_out_file, optimize, 1, NULL);

> -       cfun->machine->force_short_suffix = -1;

> -       prev->set_deleted ();

> -       current_output_insn = insn;

> -       current_insn_predicate = save_pred;

> -     }

> -      else if (want_long)

> -     fputs ("\tnop\n", asm_out_file);

> -      else

> -     {

> -       fputs ("\tnop_s\n", asm_out_file);

> -       cfun->machine->unalign ^= 2;

> -     }

> -    }

> -  return;

> -}

> -

>  /* The usual; we set up our machine_function data.  */

>

>  static struct machine_function *

> @@ -9337,7 +9326,6 @@ arc_init_machine_status (void)

>    struct machine_function *machine;

>    machine = ggc_cleared_alloc<machine_function> ();

>    machine->fn_type = ARC_FUNCTION_UNKNOWN;

> -  machine->force_short_suffix = -1;

>

>    return machine;

>  }

> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md

> index 5610bab694c..2401926f08d 100644

> --- a/gcc/config/arc/arc.md

> +++ b/gcc/config/arc/arc.md

> @@ -162,6 +162,7 @@

>    VUNSPEC_ARC_CAS

>    VUNSPEC_ARC_SC

>    VUNSPEC_ARC_LL

> +  VUNSPEC_ARC_BLOCKAGE

>    ])

>

>  (define_constants

> @@ -385,13 +386,18 @@

>  ;; and insn lengths: insns with shimm values cannot be conditionally executed.

>  (define_attr "length" ""

>    (cond

> -    [(eq_attr "iscompact" "true,maybe")

> +    [(eq_attr "iscompact" "true")

> +      (const_int 2)

> +

> +     (eq_attr "iscompact" "maybe")

>       (cond

>         [(eq_attr "type" "sfunc")

>       (cond [(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC")

>              (const_int 12)]

>             (const_int 10))

> -     (match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)]

> +     (match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)

> +     (match_test "find_reg_note (insn, REG_SAVE_NOTE, GEN_INT (1))")

> +     (const_int 4)]

>        (const_int 2))

>

>      (eq_attr "iscompact" "true_limm")

> @@ -4447,8 +4453,16 @@ archs4x, archs4xd, archs4xd_slow"

>    ""

>    "nop%?"

>    [(set_attr "type" "misc")

> -   (set_attr "iscompact" "true")

> -   (set_attr "length" "2")])

> +   (set_attr "iscompact" "maybe")

> +   (set_attr "length" "*")])

> +

> +(define_insn "blockage"

> +  [(unspec_volatile [(const_int 0)] VUNSPEC_ARC_BLOCKAGE)]

> +  ""

> +  ""

> +  [(set_attr "length" "0")

> +   (set_attr "type" "block")]

> +)

>

>  ;; Split up troublesome insns for better scheduling.

>

> @@ -4993,8 +5007,6 @@ archs4x, archs4xd, archs4xd_slow"

>    {

>      return \"rtie\";

>    }

> -  if (TARGET_PAD_RETURN)

> -    arc_pad_return ();

>    output_asm_insn (\"j%!%* [%0]%&\", &reg);

>    return \"\";

>  }

> @@ -5038,8 +5050,6 @@ archs4x, archs4xd, archs4xd_slow"

>                  arc_return_address_register (arc_compute_function_type

>                                               (cfun)));

>

> -  if (TARGET_PAD_RETURN)

> -    arc_pad_return ();

>    output_asm_insn (\"j%d0%!%# [%1]%&\", xop);

>    /* record the condition in case there is a delay insn.  */

>    arc_ccfsm_record_condition (xop[0], false, insn, 0);

> diff --git a/gcc/testsuite/gcc.target/arc/pr9001107555.c b/gcc/testsuite/gcc.target/arc/pr9001107555.c

> new file mode 100644

> index 00000000000..134426d33d9

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/arc/pr9001107555.c

> @@ -0,0 +1,38 @@

> +/* { dg-do assemble } *

> +/* { dg-skip-if "" { ! { clmcpu } } } */

> +/* { dg-options "-O3 -w -funroll-loops -mno-sdata -mcpu=arc700" } */

> +

> +typedef a __attribute__((__mode__(__DI__)));

> +typedef struct c c;

> +struct b {

> +  int d;

> +  c *e

> +};

> +enum { f };

> +typedef struct {

> +  a g;

> +  a h;

> +  int i

> +} j;

> +struct c {

> +  int count;

> +  int current

> +};

> +k;


I prefer tests to be up to date C, so 'int k;' here, _unless_ it turns
out that being old style C is critical for the test, in which case,
ideally we'd have a comment explaining that.

In general, if it's possible (without breaking the test) it would be
nice if we followed the GCC coding standard for test source, it just
makes things easier to read.

> +l(struct b *demux, __builtin_va_list args) {

> +  c m = *demux->e;

> +  j *n;

> +  switch (k)

> +  case f: {

> +    a o = __builtin_va_arg(args, a);

> +    m.current = 0;

> +    while (m.current < m.count) {

> +      if (n[m.current].h > o) {

> +        p(demux->d, 4 + 128LL * n[m.current].i);

> +        break;

> +      }

> +      m.current++;

> +    }

> +    return 0;

> +  }

> +}


Otherwise, this all looks fine.

Thanks,
Andrew

Patch

diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h
index 67f3b4e3226..ce4b6f84749 100644
--- a/gcc/config/arc/arc-protos.h
+++ b/gcc/config/arc/arc-protos.h
@@ -89,7 +89,6 @@  extern void arc_clear_unalign (void);
 extern void arc_toggle_unalign (void);
 extern void split_addsi (rtx *);
 extern void split_subsi (rtx *);
-extern void arc_pad_return (void);
 extern void arc_split_move (rtx *);
 extern const char *arc_short_long (rtx_insn *insn, const char *, const char *);
 extern rtx arc_regno_use_in (unsigned int, rtx);
diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index b1a09d82b72..22f1442a027 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -2648,8 +2648,6 @@  typedef struct GTY (()) machine_function
   struct arc_frame_info frame_info;
   /* To keep track of unalignment caused by short insns.  */
   int unalign;
-  int force_short_suffix; /* Used when disgorging return delay slot insns.  */
-  const char *size_reason;
   struct arc_ccfsm ccfsm_current;
   /* Map from uid to ccfsm state during branch shortening.  */
   rtx ccfsm_current_insn;
@@ -4307,7 +4305,7 @@  arc_print_operand (FILE *file, rtx x, int code)
 	}
       break;
     case '&':
-      if (TARGET_ANNOTATE_ALIGN && cfun->machine->size_reason)
+      if (TARGET_ANNOTATE_ALIGN)
 	fprintf (file, "; unalign: %d", cfun->machine->unalign);
       return;
     case '+':
@@ -4980,7 +4978,6 @@  static int
 arc_verify_short (rtx_insn *insn, int, int check_attr)
 {
   enum attr_iscompact iscompact;
-  struct machine_function *machine;
 
   if (check_attr > 0)
     {
@@ -4988,10 +4985,6 @@  arc_verify_short (rtx_insn *insn, int, int check_attr)
       if (iscompact == ISCOMPACT_FALSE)
 	return 0;
     }
-  machine = cfun->machine;
-
-  if (machine->force_short_suffix >= 0)
-    return machine->force_short_suffix;
 
   return (get_attr_length (insn) & 2) != 0;
 }
@@ -5030,8 +5023,6 @@  arc_final_prescan_insn (rtx_insn *insn, rtx *opvec ATTRIBUTE_UNUSED,
       cfun->machine->prescan_initialized = 1;
     }
   arc_ccfsm_advance (insn, &arc_ccfsm_current);
-
-  cfun->machine->size_reason = 0;
 }
 
 /* Given FROM and TO register numbers, say whether this elimination is allowed.
@@ -7673,6 +7664,76 @@  jli_call_scan (void)
     }
 }
 
+/* Add padding if necessary to avoid a mispredict.  A return could
+   happen immediately after the function start.  A call/return and
+   return/return must be 6 bytes apart to avoid mispredict.  */
+
+static void
+pad_return (void)
+{
+  rtx_insn *insn;
+  long offset;
+
+  if (!TARGET_PAD_RETURN)
+    return;
+
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      rtx_insn *prev0 = prev_active_insn (insn);
+      bool wantlong = false;
+
+      if (!INSN_P (insn) || GET_CODE (PATTERN (insn)) != SIMPLE_RETURN)
+	continue;
+
+      if (!prev0)
+	{
+	  prev0 = emit_insn_before (gen_nopv (), insn);
+	  /* REG_SAVE_NOTE is used by Haifa scheduler, we are in reorg
+	     so it is safe to reuse it for forcing a particular length
+	     for an instruction.  */
+	  add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1));
+	  emit_insn_before (gen_nopv (), insn);
+	  continue;
+	}
+      offset = get_attr_length (prev0);
+
+      if (get_attr_length (prev0) == 2
+	  && get_attr_iscompact (prev0) != ISCOMPACT_TRUE)
+	{
+	  /* Force long version of the insn.  */
+	  wantlong = true;
+	  offset += 2;
+	}
+
+     rtx_insn *prev = prev_active_insn (prev0);
+      if (prev)
+	offset += get_attr_length (prev);
+
+      prev = prev_active_insn (prev);
+      if (prev)
+	offset += get_attr_length (prev);
+
+      switch (offset)
+	{
+	case 2:
+	  prev = emit_insn_before (gen_nopv (), insn);
+	  add_reg_note (prev, REG_SAVE_NOTE, GEN_INT (1));
+	  break;
+	case 4:
+	  emit_insn_before (gen_nopv (), insn);
+	  break;
+	default:
+	  continue;
+	}
+
+      if (wantlong)
+	add_reg_note (prev0, REG_SAVE_NOTE, GEN_INT (1));
+
+      /* Emit a blockage to avoid delay slot scheduling.  */
+      emit_insn_before (gen_blockage(), insn);
+    }
+}
+
 static int arc_reorg_in_progress = 0;
 
 /* ARC's machince specific reorg function.  */
@@ -7698,6 +7759,7 @@  arc_reorg (void)
 
   workaround_arc_anomaly ();
   jli_call_scan ();
+  pad_return ();
 
 /* FIXME: should anticipate ccfsm action, generate special patterns for
    to-be-deleted branches that have no delay slot and have at least the
@@ -9256,79 +9318,6 @@  arc_branch_size_unknown_p (void)
   return !optimize_size && arc_reorg_in_progress;
 }
 
-/* We are about to output a return insn.  Add padding if necessary to avoid
-   a mispredict.  A return could happen immediately after the function
-   start, but after a call we know that there will be at least a blink
-   restore.  */
-
-void
-arc_pad_return (void)
-{
-  rtx_insn *insn = current_output_insn;
-  rtx_insn *prev = prev_active_insn (insn);
-  int want_long;
-
-  if (!prev)
-    {
-      fputs ("\tnop_s\n", asm_out_file);
-      cfun->machine->unalign ^= 2;
-      want_long = 1;
-    }
-  /* If PREV is a sequence, we know it must be a branch / jump or a tailcall,
-     because after a call, we'd have to restore blink first.  */
-  else if (GET_CODE (PATTERN (prev)) == SEQUENCE)
-    return;
-  else
-    {
-      want_long = (get_attr_length (prev) == 2);
-      prev = prev_active_insn (prev);
-    }
-  if (!prev
-      || ((NONJUMP_INSN_P (prev) && GET_CODE (PATTERN (prev)) == SEQUENCE)
-	  ? CALL_ATTR (as_a <rtx_sequence *> (PATTERN (prev))->insn (0),
-		       NON_SIBCALL)
-	  : CALL_ATTR (prev, NON_SIBCALL)))
-    {
-      if (want_long)
-	cfun->machine->size_reason
-	  = "call/return and return/return must be 6 bytes apart to avoid mispredict";
-      else if (TARGET_UNALIGN_BRANCH && cfun->machine->unalign)
-	{
-	  cfun->machine->size_reason
-	    = "Long unaligned jump avoids non-delay slot penalty";
-	  want_long = 1;
-	}
-      /* Disgorge delay insn, if there is any, and it may be moved.  */
-      if (final_sequence
-	  /* ??? Annulled would be OK if we can and do conditionalize
-	     the delay slot insn accordingly.  */
-	  && !INSN_ANNULLED_BRANCH_P (insn)
-	  && (get_attr_cond (insn) != COND_USE
-	      || !reg_set_p (gen_rtx_REG (CCmode, CC_REG),
-			     XVECEXP (final_sequence, 0, 1))))
-	{
-	  prev = as_a <rtx_insn *> (XVECEXP (final_sequence, 0, 1));
-	  gcc_assert (!prev_real_insn (insn)
-		      || !arc_hazard (prev_real_insn (insn), prev));
-	  cfun->machine->force_short_suffix = !want_long;
-	  rtx save_pred = current_insn_predicate;
-	  final_scan_insn (prev, asm_out_file, optimize, 1, NULL);
-	  cfun->machine->force_short_suffix = -1;
-	  prev->set_deleted ();
-	  current_output_insn = insn;
-	  current_insn_predicate = save_pred;
-	}
-      else if (want_long)
-	fputs ("\tnop\n", asm_out_file);
-      else
-	{
-	  fputs ("\tnop_s\n", asm_out_file);
-	  cfun->machine->unalign ^= 2;
-	}
-    }
-  return;
-}
-
 /* The usual; we set up our machine_function data.  */
 
 static struct machine_function *
@@ -9337,7 +9326,6 @@  arc_init_machine_status (void)
   struct machine_function *machine;
   machine = ggc_cleared_alloc<machine_function> ();
   machine->fn_type = ARC_FUNCTION_UNKNOWN;
-  machine->force_short_suffix = -1;
 
   return machine;
 }
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 5610bab694c..2401926f08d 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -162,6 +162,7 @@ 
   VUNSPEC_ARC_CAS
   VUNSPEC_ARC_SC
   VUNSPEC_ARC_LL
+  VUNSPEC_ARC_BLOCKAGE
   ])
 
 (define_constants
@@ -385,13 +386,18 @@ 
 ;; and insn lengths: insns with shimm values cannot be conditionally executed.
 (define_attr "length" ""
   (cond
-    [(eq_attr "iscompact" "true,maybe")
+    [(eq_attr "iscompact" "true")
+      (const_int 2)
+
+     (eq_attr "iscompact" "maybe")
      (cond
        [(eq_attr "type" "sfunc")
 	(cond [(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC")
 	       (const_int 12)]
 	      (const_int 10))
-	(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)]
+	(match_test "GET_CODE (PATTERN (insn)) == COND_EXEC") (const_int 4)
+	(match_test "find_reg_note (insn, REG_SAVE_NOTE, GEN_INT (1))")
+	(const_int 4)]
       (const_int 2))
 
     (eq_attr "iscompact" "true_limm")
@@ -4447,8 +4453,16 @@  archs4x, archs4xd, archs4xd_slow"
   ""
   "nop%?"
   [(set_attr "type" "misc")
-   (set_attr "iscompact" "true")
-   (set_attr "length" "2")])
+   (set_attr "iscompact" "maybe")
+   (set_attr "length" "*")])
+
+(define_insn "blockage"
+  [(unspec_volatile [(const_int 0)] VUNSPEC_ARC_BLOCKAGE)]
+  ""
+  ""
+  [(set_attr "length" "0")
+   (set_attr "type" "block")]
+)
 
 ;; Split up troublesome insns for better scheduling.
 
@@ -4993,8 +5007,6 @@  archs4x, archs4xd, archs4xd_slow"
   {
     return \"rtie\";
   }
-  if (TARGET_PAD_RETURN)
-    arc_pad_return ();
   output_asm_insn (\"j%!%* [%0]%&\", &reg);
   return \"\";
 }
@@ -5038,8 +5050,6 @@  archs4x, archs4xd, archs4xd_slow"
 		   arc_return_address_register (arc_compute_function_type
 						(cfun)));
 
-  if (TARGET_PAD_RETURN)
-    arc_pad_return ();
   output_asm_insn (\"j%d0%!%# [%1]%&\", xop);
   /* record the condition in case there is a delay insn.  */
   arc_ccfsm_record_condition (xop[0], false, insn, 0);
diff --git a/gcc/testsuite/gcc.target/arc/pr9001107555.c b/gcc/testsuite/gcc.target/arc/pr9001107555.c
new file mode 100644
index 00000000000..134426d33d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arc/pr9001107555.c
@@ -0,0 +1,38 @@ 
+/* { dg-do assemble } *
+/* { dg-skip-if "" { ! { clmcpu } } } */
+/* { dg-options "-O3 -w -funroll-loops -mno-sdata -mcpu=arc700" } */
+
+typedef a __attribute__((__mode__(__DI__)));
+typedef struct c c;
+struct b {
+  int d;
+  c *e
+};
+enum { f };
+typedef struct {
+  a g;
+  a h;
+  int i
+} j;
+struct c {
+  int count;
+  int current
+};
+k;
+l(struct b *demux, __builtin_va_list args) {
+  c m = *demux->e;
+  j *n;
+  switch (k)
+  case f: {
+    a o = __builtin_va_arg(args, a);
+    m.current = 0;
+    while (m.current < m.count) {
+      if (n[m.current].h > o) {
+        p(demux->d, 4 + 128LL * n[m.current].i);
+        break;
+      }
+      m.current++;
+    }
+    return 0;
+  }
+}