RFC: Patch to implement Aarch64 SIMD ABI

Message ID 1531950855.1378.56.camel@cavium.com
State New
Headers show
Series
  • RFC: Patch to implement Aarch64 SIMD ABI
Related show

Commit Message

Steve Ellcey July 18, 2018, 9:54 p.m.
This is a patch to support the Aarch64 SIMD ABI [1] in GCC.  I intend
to eventually follow this up with two more patches; one to define the
TARGET_SIMD_CLONE* macros and one to improve the GCC register
allocation/usage when calling SIMD functions.

The significant difference between the standard ARM ABI and the SIMD ABI
is that in the normal ABI a callee saves only the lower 64 bits of registers
V8-V15, in the SIMD ABI the callee must save all 128 bits of registers
V8-V23.

This patch checks for SIMD functions and saves the extra registers when
needed.  It does not change the caller behaviour, so with just this patch
there may be values saved by both the caller and callee.  This is not
efficient, but it is correct code.

This patch bootstraps and passes the GCC testsuite but that only verifies
I haven't broken anything, it doesn't validate the handling of SIMD functions.
I tried to write some tests, but I could never get GCC to generate code
that would save the FP callee-save registers in the prologue.  Complex code
might generate spills and fills but it never triggered the prologue/epilogue
code to save V8-V23.  If anyone has ideas on how to write a test that would
cause GCC to generate this code I would appreciate some ideas.  Just doing
lots of calculations with lots of intermediate values doesn't seem to be enough.

Steve Ellcey
sellcey@cavium.com

[1] https://developer.arm.com/products/software-development-tools/hpc/arm-compiler-for-hpc/vector-function-abi


2018-07-18  Steve Ellcey  <sellcey@cavium.com>

	* config/aarch64/aarch64.c (aarch64_attribute_table): New array.
	(aarch64_simd_function_p): New function.
	(aarch64_layout_frame): Check for simd function.
	(aarch64_process_components): Ditto.
	(aarch64_expand_prologue): Ditto.
	(aarch64_expand_epilogue): Ditto.
	(TARGET_ATTRIBUTE_TABLE): New define.
	* config/aarch64/aarch64.h (FP_SIMD_SAVED_REGNUM_P): New define.
	* config/aarch64/aarch64.md (V23_REGNUM): New constant.

Comments

Richard Sandiford July 19, 2018, 7:31 a.m. | #1
Hi,

Thanks for doing this.

Steve Ellcey <sellcey@cavium.com> writes:
> This is a patch to support the Aarch64 SIMD ABI [1] in GCC.  I intend

> to eventually follow this up with two more patches; one to define the

> TARGET_SIMD_CLONE* macros and one to improve the GCC register

> allocation/usage when calling SIMD functions.

>

> The significant difference between the standard ARM ABI and the SIMD ABI

> is that in the normal ABI a callee saves only the lower 64 bits of registers

> V8-V15, in the SIMD ABI the callee must save all 128 bits of registers

> V8-V23.

>

> This patch checks for SIMD functions and saves the extra registers when

> needed.  It does not change the caller behavour, so with just this patch

> there may be values saved by both the caller and callee.  This is not

> efficient, but it is correct code.

>

> This patch bootstraps and passes the GCC testsuite but that only verifies

> I haven't broken anything, it doesn't validate the handling of SIMD functions.

> I tried to write some tests, but I could never get GCC to generate code

> that would save the FP callee-save registers in the prologue.  Complex code

> might generate spills and fills but it never triggered the prologue/epilogue

> code to save V8-V23.  If anyone has ideas on how to write a test that would

> cause GCC to generate this code I would appreciate some ideas.  Just doing

> lots of calculations with lots of intermediate values doesn't seem to be enough.


Probably easiest to use asm clobbers, e.g.:

void __attribute__ ((aarch64_vector_pcs))
f (void)
{
  asm volatile ("" ::: "s8", "s13");
}

This also lets you control exactly which registers are saved.

> @@ -4105,7 +4128,8 @@ aarch64_layout_frame (void)

>        {

>  	/* If there is an alignment gap between integer and fp callee-saves,

>  	   allocate the last fp register to it if possible.  */

> -	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)

> +	if (regno == last_fp_reg && has_align_gap

> +	    && !simd_function && (offset & 8) == 0)

>  	  {

>  	    cfun->machine->frame.reg_offset[regno] = max_int_offset;

>  	    break;

> @@ -4117,7 +4141,7 @@ aarch64_layout_frame (void)

>  	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM

>  		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)

>  	  cfun->machine->frame.wb_candidate2 = regno;

> -	offset += UNITS_PER_WORD;

> +	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;

>        }

>  

>    offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

> @@ -4706,8 +4730,11 @@ aarch64_process_components (sbitmap components, bool prologue_p)

>    while (regno != last_regno)

>      {

>        /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved

> -	 so DFmode for the vector registers is enough.  */

> -      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;

> +	 so DFmode for the vector registers is enough.  For simd functions

> +         we want to save the entire register.  */

> +      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode

> +	: (aarch64_simd_function_p (cfun->decl) ? E_TFmode : E_DFmode);


This condition also occurs in aarch64_push_regs and aarch64_pop_regs.
It'd probably be worth splitting it out into a subfunction.

I think you also need to handle the writeback cases, which should work
for Q registers too.  This will mean extra loadwb_pair and storewb_pair
patterns.

LGTM otherwise FWIW.

> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h

> index f284e74..d11474e 100644

> --- a/gcc/config/aarch64/aarch64.h

> +++ b/gcc/config/aarch64/aarch64.h

> @@ -500,6 +500,8 @@ extern unsigned aarch64_architecture_version;

>  #define PR_LO_REGNUM_P(REGNO)\

>    (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))

>  

> +#define FP_SIMD_SAVED_REGNUM_P(REGNO)			\

> +  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))


(We should probably rewrite these to use IN_RANGE at some point,
but I agree it's better to be consistent until then.)

Thanks,
Richard
Ramana Radhakrishnan July 19, 2018, 8:52 a.m. | #2
On Thu, Jul 19, 2018 at 8:31 AM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> Hi,

>

> Thanks for doing this.

>

> Steve Ellcey <sellcey@cavium.com> writes:

>> This is a patch to support the Aarch64 SIMD ABI [1] in GCC.  I intend

>> to eventually follow this up with two more patches; one to define the

>> TARGET_SIMD_CLONE* macros and one to improve the GCC register

>> allocation/usage when calling SIMD functions.

>>

>> The significant difference between the standard ARM ABI and the SIMD ABI

>> is that in the normal ABI a callee saves only the lower 64 bits of registers

>> V8-V15, in the SIMD ABI the callee must save all 128 bits of registers

>> V8-V23.

>>

>> This patch checks for SIMD functions and saves the extra registers when

>> needed.  It does not change the caller behavour, so with just this patch

>> there may be values saved by both the caller and callee.  This is not

>> efficient, but it is correct code.

>>

>> This patch bootstraps and passes the GCC testsuite but that only verifies

>> I haven't broken anything, it doesn't validate the handling of SIMD functions.

>> I tried to write some tests, but I could never get GCC to generate code

>> that would save the FP callee-save registers in the prologue.  Complex code

>> might generate spills and fills but it never triggered the prologue/epilogue

>> code to save V8-V23.  If anyone has ideas on how to write a test that would

>> cause GCC to generate this code I would appreciate some ideas.  Just doing

>> lots of calculations with lots of intermediate values doesn't seem to be enough.

>

> Probably easiest to use asm clobbers, e.g.:

>

> void __attribute__ ((aarch64_vector_pcs))

> f (void)

> {

>   asm volatile ("" ::: "s8", "s13");

> }

>

> This also lets you control exactly which registers are saved.


For just checking the save and restore the technique Richard suggests
is probably sufficient.

One of the techniques I've used in the past is to force everything to
be tested by adding a command line option for testing.  In this case,
since the C library dependence of the testsuite isn't huge and there
wouldn't be any vector PCS interfaces to the libraries needed to run
the testsuite, things should work?

You could cross-check coverage by using lcov. Instructions thanks to
marxin are :

$> ../configure --disable-bootstrap --enable-coverage=opt
--enable-languages=c,c++,fortran,go,jit,lto --enable-host-shared
$> make
$> make check
$>find gcc/testsuite/ -name '*.gcda' -exec rm -rf {} \;
$> lcov -d . --capture --output-file gcc.info
$> lcov --remove gcc.info "/usr/*" "/opt/*" "*/gcc/gt-*"
"*/gcc/gtype-*" --output-file gcc.info
$> genhtml gcc.info --ignore-errors=source --output-directory html
--html-epilog epilog.txt

You will see a warning

genhtml: WARNING: cannot read $builddir/gcc/cfns.gperf

and you can ignore that.

Then just look at the html output that it produces, pretty neat and I
see about 80% coverage on an aarch64-none-linux-gnu test run with
c,c++,fortran,go,lto IIRC.


regards
Ramana

1.
>

>> @@ -4105,7 +4128,8 @@ aarch64_layout_frame (void)

>>        {

>>       /* If there is an alignment gap between integer and fp callee-saves,

>>          allocate the last fp register to it if possible.  */

>> -     if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)

>> +     if (regno == last_fp_reg && has_align_gap

>> +         && !simd_function && (offset & 8) == 0)

>>         {

>>           cfun->machine->frame.reg_offset[regno] = max_int_offset;

>>           break;

>> @@ -4117,7 +4141,7 @@ aarch64_layout_frame (void)

>>       else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM

>>                && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)

>>         cfun->machine->frame.wb_candidate2 = regno;

>> -     offset += UNITS_PER_WORD;

>> +     offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;

>>        }

>>

>>    offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

>> @@ -4706,8 +4730,11 @@ aarch64_process_components (sbitmap components, bool prologue_p)

>>    while (regno != last_regno)

>>      {

>>        /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved

>> -      so DFmode for the vector registers is enough.  */

>> -      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;

>> +      so DFmode for the vector registers is enough.  For simd functions

>> +         we want to save the entire register.  */

>> +      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode

>> +     : (aarch64_simd_function_p (cfun->decl) ? E_TFmode : E_DFmode);

>

> This condition also occurs in aarch64_push_regs and aarch64_pop_regs.

> It'd probably be worth splitting it out into a subfunction.

>

> I think you also need to handle the writeback cases, which should work

> for Q registers too.  This will mean extra loadwb_pair and storewb_pair

> patterns.

>

> LGTM otherwise FWIW.

>

>> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h

>> index f284e74..d11474e 100644

>> --- a/gcc/config/aarch64/aarch64.h

>> +++ b/gcc/config/aarch64/aarch64.h

>> @@ -500,6 +500,8 @@ extern unsigned aarch64_architecture_version;

>>  #define PR_LO_REGNUM_P(REGNO)\

>>    (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))

>>

>> +#define FP_SIMD_SAVED_REGNUM_P(REGNO)                        \

>> +  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))

>

> (We should probably rewrite these to use IN_RANGE at some point,

> but I agree it's better to be consistent until then.)

>

> Thanks,

> Richard
Steve Ellcey July 19, 2018, 9:29 p.m. | #3
On Thu, 2018-07-19 at 08:31 +0100, Richard Sandiford wrote:

> > @@ -4706,8 +4730,11 @@ aarch64_process_components (sbitmap

> > components, bool prologue_p)

> >    while (regno != last_regno)

> >      {

> >        /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved

> > -      so DFmode for the vector registers is enough.  */

> > -      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;

> > +      so DFmode for the vector registers is enough.  For simd functions

> > +         we want to save the entire register.  */

> > +      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode

> > +     : (aarch64_simd_function_p (cfun->decl) ? E_TFmode : E_DFmode);

> This condition also occurs in aarch64_push_regs and aarch64_pop_regs.

> It'd probably be worth splitting it out into a subfunction.

> 

> I think you also need to handle the writeback cases, which should work

> for Q registers too.  This will mean extra loadwb_pair and storewb_pair

> patterns.

> 

> LGTM otherwise FWIW.


Yes, I see where I missed this in aarch64_push_regs
and aarch64_pop_regs.  I think that is why the second of
Wilco's two examples (f2) is wrong.  I am unclear about
exactly what is meant by writeback and why we have it and
how that and callee_adjust are used.  Any chance someone
could help me understand this part of the prologue/epilogue
code better?  The comments in aarch64.c/aarch64.h aren't
really helping me understand what the code is doing or
why it is doing it.

Steve Ellcey
sellcey@cavium.com
Wilco Dijkstra July 20, 2018, 11:11 a.m. | #4
Steve Ellcey wrote:

> Yes, I see where I missed this in aarch64_push_regs

> and aarch64_pop_regs.  I think that is why the second of

> Wilco's two examples (f2) is wrong.  I am unclear about

> exactly what is meant by writeback and why we have it and

> how that and callee_adjust are used.  Any chance someone

> could help me understand this part of the prologue/epilogue

> code better?  The comments in aarch64.c/aarch64.h aren't

> really helping me understand what the code is doing or

> why it is doing it.


Writeback is the same as a base update in a load or store. When
creating the frame there are 3 stack adjustments to be made:
creating stack for locals, pushing callee-saves and reserving space
for outgoing arguments. We merge these stack adjustments as much as
possible and use load/store with writeback for codesize and performance.
See the last part in layout_frame for the different cases.

In many common cases the frame is small and there are no outgoing
arguments, so we emit an STP with writeback to store the first 2 callee saves
and create the full frame in a single instruction. In this case callee_adjust will
be the frame size and initial_adjust will be zero.

push_regs and pop_regs need to be passed a mode since layout_frame
will use STP with writeback of floating point callee-saves if there are no integer
callee-saves. Note that if there is only one callee-save, or an odd number of them,
it may use LDR/STR with writeback, so we need to support TFmode for these too.

Wilco
Steve Ellcey July 20, 2018, 3:09 p.m. | #5
On Fri, 2018-07-20 at 11:11 +0000, Wilco Dijkstra wrote:

> Steve Ellcey wrote:

> 

> > Yes, I see where I missed this in aarch64_push_regs

> > and aarch64_pop_regs.  I think that is why the second of

> > Wilco's two examples (f2) is wrong.  I am unclear about

> > exactly what is meant by writeback and why we have it and

> > how that and callee_adjust are used.  Any chance someone

> > could help me understand this part of the prologue/epilogue

> > code better?  The comments in aarch64.c/aarch64.h aren't

> > really helping me understand what the code is doing or

> > why it is doing it.


> Writeback is the same as a base update in a load or store. When

> creating the frame there are 3 stack adjustments to be made:

> creating stack for locals, pushing callee-saves and reserving space

> for outgoing arguments. We merge these stack adjustments as much as

> possible and use load/store with writeback for codesize and performance.

> See the last part in layout_frame for the different cases.


OK, I think I understand this a bit better now.  I think my main
problem is with the  term 'writeback' which I am not used to seeing.
But if I understand things correctly we are saving one or two registers
and (possibly) updating the stack pointer using auto-increment/auto-
decrement in one instruction and that the updating of SP is what you
mean by 'writeback'.  Correct?

Steve Ellcey
sellcey@cavium.com
Wilco Dijkstra July 23, 2018, 2:14 p.m. | #6
Steve Ellcey wrote:

> OK, I think I understand this a bit better now.  I think my main

> problem is with the  term 'writeback' which I am not used to seeing.

> But if I understand things correctly we are saving one or two registers

> and (possibly) updating the stack pointer using auto-increment/auto-

> decrement in one instruction and that the updating of SP is what you

> mean by 'writeback'.  Correct?


Correct. The term has been in use since the very first Arm CPUs, where
load/stores have a writeback bit to control whether the base register is updated.
Note that we don't limit the instructions to simple push/pops: SP is updated
by the frame size rather than by the transfer size.

Wilco
Steve Ellcey July 23, 2018, 8:08 p.m. | #7
Here is an updated version of my patch for the Aarch64 SIMD ABI.  I
think the writeback register saves are correct now and I improved the
register allocation by defining REG_ALLOC_ORDER.  I also added clobbers
to expand_call when calling a non-SIMD function from a SIMD function.

I am still testing but any feedback on what I have so far would be
helpful.

Steve Ellcey
sellcey@cavium.com

2018-07-23  Steve Ellcey  <sellcey@cavium.com>

	* config/aarch64/aarch64.c (aarch64_attribute_table): New array.
	(aarch64_simd_decl_p): New function.
	(aarch64_reg_save_mode): New function.
	(aarch64_is_simd_call_p): New function.
	(aarch64_layout_frame): Check for simd function.
	(aarch64_gen_storewb_pair): Handle E_TFmode.
	(aarch64_push_regs): Use aarch64_reg_save_mode to get mode.
	(aarch64_gen_loadwb_pair): Handle E_TFmode.
	(aarch64_pop_regs): Use aarch64_reg_save_mode to get mode.
	(aarch64_components_for_bb): Check for simd function.
	(aarch64_process_components): Ditto.
	(aarch64_expand_prologue): Ditto.
	(aarch64_expand_epilogue): Ditto.
	(aarch64_expand_call): Ditto.
	(TARGET_ATTRIBUTE_TABLE): New define.
	* config/aarch64/aarch64.h (REG_ALLOC_ORDER): New define.
	(HONOR_REG_ALLOC_ORDER): Ditto.
	(FP_SIMD_SAVED_REGNUM_P): Ditto.
	* config/aarch64/aarch64.md (V23_REGNUM): New constant.
	(loadwb_pair<TX:mode>_<P:mode>): New instruction.
	(storewb_pair<TX:mode>_<P:mode>): Ditto.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1369704..d7557e2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1026,6 +1026,15 @@ static const struct processor *selected_tune;
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
+/* Table of machine attributes.  */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "aarch64_vector_pcs", 0, 0, true,  false, false, false, NULL, NULL },
+  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
+};
+
 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -1404,6 +1413,26 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
   return false;
 }
 
+/* Return true if this is a definition of a vectorized simd function.  */
+
+static bool
+aarch64_simd_decl_p (tree fndecl)
+{
+  if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != NULL)
+    return true;
+  if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL)
+    return false;
+  return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl))));
+}
+
+static
+machine_mode aarch64_reg_save_mode (tree fndecl, unsigned regno)
+{
+  return GP_REGNUM_P (regno)
+	   ? E_DImode
+	   : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+}
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
    clobbers the top 64 bits when restoring the bottom 64 bits.  */
@@ -1498,6 +1527,13 @@ aarch64_is_noplt_call_p (rtx sym)
   return false;
 }
 
+static bool
+aarch64_is_simd_call_p (rtx sym)
+{
+  tree decl = SYMBOL_REF_DECL (sym);
+  return  decl && aarch64_simd_decl_p (decl);
+}
+
 /* Return true if the offsets to a zero/sign-extract operation
    represent an expression that matches an extend operation.  The
    operands represent the paramters from
@@ -4034,6 +4070,7 @@ aarch64_layout_frame (void)
 {
   HOST_WIDE_INT offset = 0;
   int regno, last_fp_reg = INVALID_REGNUM;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
 
   if (reload_completed && cfun->machine->frame.laid_out)
     return;
@@ -4068,7 +4105,8 @@ aarch64_layout_frame (void)
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
     if (df_regs_ever_live_p (regno)
-	&& !call_used_regs[regno])
+	&& (!call_used_regs[regno]
+	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
       {
 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
 	last_fp_reg = regno;
@@ -4105,7 +4143,8 @@ aarch64_layout_frame (void)
       {
 	/* If there is an alignment gap between integer and fp callee-saves,
 	   allocate the last fp register to it if possible.  */
-	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+	if (regno == last_fp_reg && has_align_gap
+	    && !simd_function && (offset & 8) == 0)
 	  {
 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
 	    break;
@@ -4117,7 +4156,7 @@ aarch64_layout_frame (void)
 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
 	  cfun->machine->frame.wb_candidate2 = regno;
-	offset += UNITS_PER_WORD;
+	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
       }
 
   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4260,6 +4299,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
       return gen_storewb_pairdf_di (base, base, reg, reg2,
 				    GEN_INT (-adjustment),
 				    GEN_INT (UNITS_PER_WORD - adjustment));
+    case E_TFmode:
+      return gen_storewb_pairtf_di (base, base, reg, reg2,
+				    GEN_INT (-adjustment),
+				    GEN_INT (UNITS_PER_VREG - adjustment));
     default:
       gcc_unreachable ();
     }
@@ -4272,7 +4315,7 @@ static void
 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
 {
   rtx_insn *insn;
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
 
   if (regno2 == INVALID_REGNUM)
     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
@@ -4302,6 +4345,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
     case E_DFmode:
       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
 				   GEN_INT (UNITS_PER_WORD));
+    case E_TFmode:
+      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
+				   GEN_INT (UNITS_PER_VREG));
     default:
       gcc_unreachable ();
     }
@@ -4315,7 +4361,7 @@ static void
 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
 		  rtx *cfi_ops)
 {
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
   rtx reg1 = gen_rtx_REG (mode, regno1);
 
   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
@@ -4628,13 +4674,15 @@ aarch64_components_for_bb (basic_block bb)
   bitmap in = DF_LIVE_IN (bb);
   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
 
   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
   bitmap_clear (components);
 
   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
-    if ((!call_used_regs[regno])
+    if ((!call_used_regs[regno]
+	|| (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
        && (bitmap_bit_p (in, regno)
 	   || bitmap_bit_p (gen, regno)
 	   || bitmap_bit_p (kill, regno)))
@@ -4706,8 +4754,10 @@ aarch64_process_components (sbitmap components, bool prologue_p)
   while (regno != last_regno)
     {
       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
-	 so DFmode for the vector registers is enough.  */
-      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+	 so DFmode for the vector registers is enough.  For simd functions
+         we want to save the entire register.  */
+      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+      
       rtx reg = gen_rtx_REG (mode, regno);
       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
       if (!frame_pointer_needed)
@@ -4736,6 +4786,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
 	 mergeable with the current one into a pair.  */
       if (!satisfies_constraint_Ump (mem)
 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+	  || (aarch64_simd_decl_p (cfun->decl) && (FP_REGNUM_P (regno)))
 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
 		       GET_MODE_SIZE (mode)))
 	{
@@ -4958,8 +5009,12 @@ aarch64_expand_prologue (void)
 
   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 			     callee_adjust != 0 || emit_frame_chain);
-  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-			     callee_adjust != 0 || emit_frame_chain);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
+  else
+    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
 }
 
@@ -5040,8 +5095,12 @@ aarch64_expand_epilogue (bool for_sibcall)
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
-  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-				callee_adjust != 0, &cfi_ops);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
+  else
+    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
 
   if (need_barrier_p)
     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -6281,6 +6340,7 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall)
   rtx call, callee, tmp;
   rtvec vec;
   machine_mode mode;
+  rtx *fusage;
 
   gcc_assert (MEM_P (mem));
   callee = XEXP (mem, 0);
@@ -6309,6 +6369,14 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall)
   vec = gen_rtvec (2, call, tmp);
   call = gen_rtx_PARALLEL (VOIDmode, vec);
 
+  if (aarch64_simd_decl_p (cfun->decl) && !aarch64_is_simd_call_p (callee))
+    {
+      rtx *fusage = &CALL_INSN_FUNCTION_USAGE (call);
+      int i;
+      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
+	clobber_reg (fusage, gen_rtx_REG (TFmode, i));
+    }
+
   aarch64_emit_call_insn (call);
 }
 
@@ -18070,6 +18138,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SELECT_EARLY_REMAT_MODES
 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
 
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index f284e74..6b9becc 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -389,6 +389,37 @@ extern unsigned aarch64_architecture_version;
     V_ALIASES(28), V_ALIASES(29), V_ALIASES(30), V_ALIASES(31)  \
   }
 
+/* This is here just to change the order of the vector registers so
+   that V24 to V31 are used before V16 to V23.  In SIMD functions
+   V16 to V23 are callee saved so we want to use V24 to V31 first.
+
+   ADJUST_REG_ALLOC_ORDER does not work if REG_ALLOC_ORDER is not used.  */
+
+#define REG_ALLOC_ORDER				\
+{						\
+  /* Argument registers.  */			\
+  0, 1, 2, 3, 4, 5, 6, 7,			\
+  /* Caller-saved registers.  */		\
+  8, 9, 10, 11, 12, 13, 14, 15,			\
+  16, 17, 18, 					\
+  /* Callee-saved registers.  */		\
+  19, 20, 21, 22, 23, 24, 25, 26,		\
+  27, 28,					\
+  /* All other registers.  */			\
+  29, 30, 31,					\
+  /* Argument vregisters.  */			\
+  32, 33, 34, 35, 36, 37, 38, 39,		\
+  /* Caller-saved vregisters.  */		\
+  56, 57, 58, 59, 60, 61, 62, 63,		\
+  48, 49, 50, 51, 52, 53, 54, 55,		\
+  /* Callee-saved vregisters.  */		\
+  40, 41, 42, 43, 44, 45, 46, 47,		\
+  /* Other pseudo registers.  */		\
+  64, 65, 66					\
+}
+
+#define HONOR_REG_ALLOC_ORDER 1
+
 /* Say that the return address register is used by the epilogue, but only after
    epilogue generation is complete.  Note that in the case of sibcalls, the
    values "used by the epilogue" are considered live at the start of the called
@@ -500,6 +531,8 @@ extern unsigned aarch64_architecture_version;
 #define PR_LO_REGNUM_P(REGNO)\
   (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))
 
+#define FP_SIMD_SAVED_REGNUM_P(REGNO)			\
+  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
 
 /* Register and constant classes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a014a01..da82782 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -63,6 +63,7 @@
     (V15_REGNUM		47)
     (V16_REGNUM		48)
     (V20_REGNUM		52)
+    (V23_REGNUM		55)
     (V24_REGNUM		56)
     (V28_REGNUM		60)
     (V31_REGNUM		63)
@@ -1413,6 +1414,21 @@
   [(set_attr "type" "neon_load1_2reg")]
 )
 
+(define_insn "loadwb_pair<TX:mode>_<P:mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand" "=k")
+          (plus:P (match_operand:P 1 "register_operand" "0")
+                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (match_operand:TX 2 "register_operand" "=w")
+          (mem:TX (match_dup 1)))
+     (set (match_operand:TX 3 "register_operand" "=w")
+          (mem:TX (plus:P (match_dup 1)
+                  (match_operand:P 5 "const_int_operand" "n"))))])]
+  "INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
+  "ldp\\t%<w>2, %<w>3, [%1], %4"
+  [(set_attr "type" "neon_load1_2reg")]
+)
+
 ;; Store pair with pre-index writeback.  This is primarily used in function
 ;; prologues.
 (define_insn "storewb_pair<GPI:mode>_<P:mode>"
@@ -1447,6 +1463,22 @@
   [(set_attr "type" "neon_store1_2reg<q>")]
 )
 
+(define_insn "storewb_pair<TX:mode>_<P:mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand" "=&k")
+          (plus:P (match_operand:P 1 "register_operand" "0")
+                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (mem:TX (plus:P (match_dup 0)
+                  (match_dup 4)))
+          (match_operand:TX 2 "register_operand" "w"))
+     (set (mem:TX (plus:P (match_dup 0)
+                  (match_operand:P 5 "const_int_operand" "n")))
+          (match_operand:TX 3 "register_operand" "w"))])]
+  "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (<TX:MODE>mode)"
+  "stp\\t%<w>2, %<w>3, [%0, %4]!"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
 ;; -------------------------------------------------------------------
 ;; Sign/Zero extension
 ;; -------------------------------------------------------------------
Steve Ellcey July 25, 2018, 6:30 p.m. | #8
Here is version 3 of my patch to implement the SIMD ABI on Aarch64.
I am having a problem with how to handle a SIMD function calling a
non-SIMD function.  When this happens the SIMD function needs to save
V8 to V23 because it cannot count on the non-SIMD function to save
all 128 bits of these registers.

I thought I had this working in the last patch but as I write test
cases, it appears that it is not working and I am not sure how to
implement it.  I tried adding clobbers in aarch64_expand_call but
that is not working (see code in this patch in aarch64_expand_call).
If I add them to 'call' which is a parallel insn, they are ignored.
If I find the underlying call instruction that is part of the parallel
then the clobbers get added to the instruction but then the call itself
is not recognized with the extra clobbers in place.  I don't think we
want to add new call instructions in aarch64.md to handle the vector
register saves and restores.  Am I trying to add the clobbers in the
wrong place?  Where and when should extra clobbers be added to a call
that is going to clobber more registers than what is indicated by
CALL_USED_REGISTERS?

I suppose I could use TARGET_HARD_REGNO_CALL_PART_CLOBBERED but I would
have to extend it to include the call instruction as an argument so that
the code could determine if the call being made was to a simd or non-simd
function.

Steve Ellcey
sellcey@cavium.com


2018-07-25  Steve Ellcey  <sellcey@cavium.com>

	* config/aarch64/aarch64.c (aarch64_attribute_table): New array.
	(aarch64_simd_decl_p): New function.
	(aarch64_reg_save_mode): New function.
	(aarch64_is_simd_call_p): New function.
	(aarch64_function_ok_for_sibcall): Check for simd calls.
	(aarch64_layout_frame): Check for simd function.
	(aarch64_gen_storewb_pair): Handle E_TFmode.
	(aarch64_push_regs): Use aarch64_reg_save_mode to get mode.
	(aarch64_gen_loadwb_pair): Handle E_TFmode.
	(aarch64_pop_regs): Use aarch64_reg_save_mode to get mode.
	(aarch64_components_for_bb): Check for simd function.
	(aarch64_process_components): Ditto.
	(aarch64_expand_prologue): Ditto.
	(aarch64_expand_epilogue): Ditto.
	(aarch64_expand_call): Ditto.
	(TARGET_ATTRIBUTE_TABLE): New define.
	* config/aarch64/aarch64.h (REG_ALLOC_ORDER): New define.
	(HONOR_REG_ALLOC_ORDER): Ditto.
	(FP_SIMD_SAVED_REGNUM_P): Ditto.
	* config/aarch64/aarch64.md (V23_REGNUM): New constant.
	(loadwb_pair<TX:mode>_<P:mode>): New instruction.
	(storewb_pair<TX:mode>_<P:mode>): Ditto.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa01475..cc642f5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1027,6 +1027,15 @@ static const struct processor *selected_tune;
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
+/* Table of machine attributes.  */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "aarch64_vector_pcs", 0, 0, true,  false, false, false, NULL, NULL },
+  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
+};
+
 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -1405,6 +1414,26 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
   return false;
 }
 
+/* Return true if this is a definition of a vectorized simd function.  */
+
+static bool
+aarch64_simd_decl_p (tree fndecl)
+{
+  if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != NULL)
+    return true;
+  if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL)
+    return false;
+  return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl))));
+}
+
+static
+machine_mode aarch64_reg_save_mode (tree fndecl, unsigned regno)
+{
+  return GP_REGNUM_P (regno)
+	   ? E_DImode
+	   : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+}
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
    clobbers the top 64 bits when restoring the bottom 64 bits.  */
@@ -1499,6 +1528,13 @@ aarch64_is_noplt_call_p (rtx sym)
   return false;
 }
 
+static bool
+aarch64_is_simd_call_p (rtx sym)
+{
+  tree decl = SYMBOL_REF_DECL (sym);
+  return  decl && aarch64_simd_decl_p (decl);
+}
+
 /* Return true if the offsets to a zero/sign-extract operation
    represent an expression that matches an extend operation.  The
    operands represent the paramters from
@@ -3269,10 +3305,11 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
 }
 
 static bool
-aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
-				 tree exp ATTRIBUTE_UNUSED)
+aarch64_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
 {
-  /* Currently, always true.  */
+  if (aarch64_simd_decl_p (cfun->decl) && (!decl || !aarch64_simd_decl_p (decl)))
+    return false;
+
   return true;
 }
 
@@ -4035,6 +4072,7 @@ aarch64_layout_frame (void)
 {
   HOST_WIDE_INT offset = 0;
   int regno, last_fp_reg = INVALID_REGNUM;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
 
   if (reload_completed && cfun->machine->frame.laid_out)
     return;
@@ -4069,7 +4107,8 @@ aarch64_layout_frame (void)
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
     if (df_regs_ever_live_p (regno)
-	&& !call_used_regs[regno])
+	&& (!call_used_regs[regno]
+	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
       {
 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
 	last_fp_reg = regno;
@@ -4106,7 +4145,8 @@ aarch64_layout_frame (void)
       {
 	/* If there is an alignment gap between integer and fp callee-saves,
 	   allocate the last fp register to it if possible.  */
-	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+	if (regno == last_fp_reg && has_align_gap
+	    && !simd_function && (offset & 8) == 0)
 	  {
 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
 	    break;
@@ -4118,7 +4158,7 @@ aarch64_layout_frame (void)
 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
 	  cfun->machine->frame.wb_candidate2 = regno;
-	offset += UNITS_PER_WORD;
+	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
       }
 
   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4261,6 +4301,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
       return gen_storewb_pairdf_di (base, base, reg, reg2,
 				    GEN_INT (-adjustment),
 				    GEN_INT (UNITS_PER_WORD - adjustment));
+    case E_TFmode:
+      return gen_storewb_pairtf_di (base, base, reg, reg2,
+				    GEN_INT (-adjustment),
+				    GEN_INT (UNITS_PER_VREG - adjustment));
     default:
       gcc_unreachable ();
     }
@@ -4273,7 +4317,7 @@ static void
 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
 {
   rtx_insn *insn;
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
 
   if (regno2 == INVALID_REGNUM)
     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
@@ -4303,6 +4347,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
     case E_DFmode:
       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
 				   GEN_INT (UNITS_PER_WORD));
+    case E_TFmode:
+      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
+				   GEN_INT (UNITS_PER_VREG));
     default:
       gcc_unreachable ();
     }
@@ -4316,7 +4363,7 @@ static void
 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
 		  rtx *cfi_ops)
 {
-  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+  machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
   rtx reg1 = gen_rtx_REG (mode, regno1);
 
   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
@@ -4629,13 +4676,15 @@ aarch64_components_for_bb (basic_block bb)
   bitmap in = DF_LIVE_IN (bb);
   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+  bool simd_function = aarch64_simd_decl_p (cfun->decl);
 
   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
   bitmap_clear (components);
 
   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
-    if ((!call_used_regs[regno])
+    if ((!call_used_regs[regno]
+	|| (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
        && (bitmap_bit_p (in, regno)
 	   || bitmap_bit_p (gen, regno)
 	   || bitmap_bit_p (kill, regno)))
@@ -4707,8 +4756,10 @@ aarch64_process_components (sbitmap components, bool prologue_p)
   while (regno != last_regno)
     {
       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
-	 so DFmode for the vector registers is enough.  */
-      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+	 so DFmode for the vector registers is enough.  For simd functions
+         we want to save the entire register.  */
+      machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+      
       rtx reg = gen_rtx_REG (mode, regno);
       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
       if (!frame_pointer_needed)
@@ -4737,6 +4788,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
 	 mergeable with the current one into a pair.  */
       if (!satisfies_constraint_Ump (mem)
 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+	  || (aarch64_simd_decl_p (cfun->decl) && (FP_REGNUM_P (regno)))
 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
 		       GET_MODE_SIZE (mode)))
 	{
@@ -4959,8 +5011,12 @@ aarch64_expand_prologue (void)
 
   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 			     callee_adjust != 0 || emit_frame_chain);
-  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-			     callee_adjust != 0 || emit_frame_chain);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
+  else
+    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
 }
 
@@ -5041,8 +5097,12 @@ aarch64_expand_epilogue (bool for_sibcall)
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
-  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-				callee_adjust != 0, &cfi_ops);
+  if (aarch64_simd_decl_p (cfun->decl))
+    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
+  else
+    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
 
   if (need_barrier_p)
     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -6318,6 +6378,18 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall)
   vec = gen_rtvec (2, call, tmp);
   call = gen_rtx_PARALLEL (VOIDmode, vec);
 
+#if 1
+  if (aarch64_simd_decl_p (cfun->decl) && !aarch64_is_simd_call_p (callee))
+    {
+      rtx *fusage = &CALL_INSN_FUNCTION_USAGE (call);
+      int i;
+
+      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
+	if (FP_SIMD_SAVED_REGNUM_P (i))
+	  clobber_reg (fusage, gen_rtx_REG (TFmode, i));
+    }
+#endif
+
   aarch64_emit_call_insn (call);
 }
 
@@ -18210,6 +18282,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SELECT_EARLY_REMAT_MODES
 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
 
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c121850..279dbed 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -392,6 +392,37 @@ extern unsigned aarch64_architecture_version;
     V_ALIASES(28), V_ALIASES(29), V_ALIASES(30), V_ALIASES(31)  \
   }
 
+/* This is here just to change the order of the vector registers so
+   that V24 to V31 are used before V16 to V23.  In SIMD functions
+   V16 to V23 are callee saved so we want to use V24 to V31 first.
+
+   ADJUST_REG_ALLOC_ORDER does not work if REG_ALLOC_ORDER is not used.  */
+
+#define REG_ALLOC_ORDER				\
+{						\
+  /* Argument registers.  */			\
+  0, 1, 2, 3, 4, 5, 6, 7,			\
+  /* Caller-saved registers.  */		\
+  8, 9, 10, 11, 12, 13, 14, 15,			\
+  16, 17, 18, 					\
+  /* Callee-saved registers.  */		\
+  19, 20, 21, 22, 23, 24, 25, 26,		\
+  27, 28,					\
+  /* All other registers.  */			\
+  29, 30, 31,					\
+  /* Argument vregisters.  */			\
+  32, 33, 34, 35, 36, 37, 38, 39,		\
+  /* Caller-saved vregisters.  */		\
+  56, 57, 58, 59, 60, 61, 62, 63,		\
+  48, 49, 50, 51, 52, 53, 54, 55,		\
+  /* Callee-saved vregisters.  */		\
+  40, 41, 42, 43, 44, 45, 46, 47,		\
+  /* Other pseudo registers.  */		\
+  64, 65, 66					\
+}
+
+#define HONOR_REG_ALLOC_ORDER 1
+
 /* Say that the return address register is used by the epilogue, but only after
    epilogue generation is complete.  Note that in the case of sibcalls, the
    values "used by the epilogue" are considered live at the start of the called
@@ -503,6 +534,8 @@ extern unsigned aarch64_architecture_version;
 #define PR_LO_REGNUM_P(REGNO)\
   (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))
 
+#define FP_SIMD_SAVED_REGNUM_P(REGNO)			\
+  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
 
 /* Register and constant classes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e9c16f9..74a4821 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -63,6 +63,7 @@
     (V15_REGNUM		47)
     (V16_REGNUM		48)
     (V20_REGNUM		52)
+    (V23_REGNUM		55)
     (V24_REGNUM		56)
     (V28_REGNUM		60)
     (V31_REGNUM		63)
@@ -1413,6 +1414,21 @@
   [(set_attr "type" "neon_load1_2reg")]
 )
 
+(define_insn "loadwb_pair<TX:mode>_<P:mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand" "=k")
+          (plus:P (match_operand:P 1 "register_operand" "0")
+                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (match_operand:TX 2 "register_operand" "=w")
+          (mem:TX (match_dup 1)))
+     (set (match_operand:TX 3 "register_operand" "=w")
+          (mem:TX (plus:P (match_dup 1)
+                  (match_operand:P 5 "const_int_operand" "n"))))])]
+  "INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
+  "ldp\\t%q2, %q3, [%1], %4"
+  [(set_attr "type" "neon_load1_2reg")]
+)
+
 ;; Store pair with pre-index writeback.  This is primarily used in function
 ;; prologues.
 (define_insn "storewb_pair<GPI:mode>_<P:mode>"
@@ -1447,6 +1463,22 @@
   [(set_attr "type" "neon_store1_2reg<q>")]
 )
 
+(define_insn "storewb_pair<TX:mode>_<P:mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand" "=&k")
+          (plus:P (match_operand:P 1 "register_operand" "0")
+                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (mem:TX (plus:P (match_dup 0)
+                  (match_dup 4)))
+          (match_operand:TX 2 "register_operand" "w"))
+     (set (mem:TX (plus:P (match_dup 0)
+                  (match_operand:P 5 "const_int_operand" "n")))
+          (match_operand:TX 3 "register_operand" "w"))])]
+  "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (<TX:MODE>mode)"
+  "stp\\t%q2, %q3, [%0, %4]!"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
 ;; -------------------------------------------------------------------
 ;; Sign/Zero extension
 ;; -------------------------------------------------------------------

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1369704..b25da11 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1026,6 +1026,15 @@  static const struct processor *selected_tune;
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
+/* Table of machine attributes.  */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+       affects_type_identity, handler, exclude } */
+  { "aarch64_vector_pcs", 0, 0, true,  false, false, false, NULL, NULL },
+  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
+};
+
 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -1404,6 +1413,18 @@  aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
   return false;
 }
 
+/* Return true if this is a definition of a vectorized simd function.  */
+
+static bool
+aarch64_simd_function_p (tree fndecl)
+{
+  if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != NULL)
+    return true;
+  if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL)
+    return false;
+  return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl))));
+}
+
 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
    clobbers the top 64 bits when restoring the bottom 64 bits.  */
@@ -4034,6 +4055,7 @@  aarch64_layout_frame (void)
 {
   HOST_WIDE_INT offset = 0;
   int regno, last_fp_reg = INVALID_REGNUM;
+  bool simd_function = aarch64_simd_function_p (cfun->decl);
 
   if (reload_completed && cfun->machine->frame.laid_out)
     return;
@@ -4068,7 +4090,8 @@  aarch64_layout_frame (void)
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
     if (df_regs_ever_live_p (regno)
-	&& !call_used_regs[regno])
+	&& (!call_used_regs[regno]
+	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
       {
 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
 	last_fp_reg = regno;
@@ -4105,7 +4128,8 @@  aarch64_layout_frame (void)
       {
 	/* If there is an alignment gap between integer and fp callee-saves,
 	   allocate the last fp register to it if possible.  */
-	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+	if (regno == last_fp_reg && has_align_gap
+	    && !simd_function && (offset & 8) == 0)
 	  {
 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
 	    break;
@@ -4117,7 +4141,7 @@  aarch64_layout_frame (void)
 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
 	  cfun->machine->frame.wb_candidate2 = regno;
-	offset += UNITS_PER_WORD;
+	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
       }
 
   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4706,8 +4730,11 @@  aarch64_process_components (sbitmap components, bool prologue_p)
   while (regno != last_regno)
     {
       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
-	 so DFmode for the vector registers is enough.  */
-      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+	 so DFmode for the vector registers is enough.  For simd functions
+         we want to save the entire register.  */
+      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode
+	: (aarch64_simd_function_p (cfun->decl) ? E_TFmode : E_DFmode);
+      
       rtx reg = gen_rtx_REG (mode, regno);
       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
       if (!frame_pointer_needed)
@@ -4736,6 +4763,7 @@  aarch64_process_components (sbitmap components, bool prologue_p)
 	 mergeable with the current one into a pair.  */
       if (!satisfies_constraint_Ump (mem)
 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+	  || (aarch64_simd_function_p (cfun->decl) && (FP_REGNUM_P (regno)))
 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
 		       GET_MODE_SIZE (mode)))
 	{
@@ -4958,8 +4986,12 @@  aarch64_expand_prologue (void)
 
   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 			     callee_adjust != 0 || emit_frame_chain);
-  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-			     callee_adjust != 0 || emit_frame_chain);
+  if (aarch64_simd_function_p (cfun->decl))
+    aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
+  else
+    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+			       callee_adjust != 0 || emit_frame_chain);
   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
 }
 
@@ -5040,8 +5072,12 @@  aarch64_expand_epilogue (bool for_sibcall)
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
-  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-				callee_adjust != 0, &cfi_ops);
+  if (aarch64_simd_function_p (cfun->decl))
+    aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
+  else
+    aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+				  callee_adjust != 0, &cfi_ops);
 
   if (need_barrier_p)
     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -18070,6 +18106,9 @@  aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SELECT_EARLY_REMAT_MODES
 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
 
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index f284e74..d11474e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -500,6 +500,8 @@  extern unsigned aarch64_architecture_version;
 #define PR_LO_REGNUM_P(REGNO)\
   (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))
 
+#define FP_SIMD_SAVED_REGNUM_P(REGNO)			\
+  (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
 
 /* Register and constant classes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a014a01..d319430 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -63,6 +63,7 @@ 
     (V15_REGNUM		47)
     (V16_REGNUM		48)
     (V20_REGNUM		52)
+    (V23_REGNUM		55)
     (V24_REGNUM		56)
     (V28_REGNUM		60)
     (V31_REGNUM		63)