[PING,AArch64] Disable reg offset in quad-word store for Falkor

Message ID 20180117112459.23362-1-siddhesh@gotplt.org
State New
Headers show
Series
  • [PING,AArch64] Disable reg offset in quad-word store for Falkor
Related show

Commit Message

Siddhesh Poyarekar Jan. 17, 2018, 11:24 a.m.
From: Siddhesh Poyarekar <siddhesh@sourceware.org>


Hi,

Jim Wilson posted a patch for this in September[1] and, following the
discussion, it appears that the patch was considered an acceptable fix for
Falkor.  Kugan followed up[2] with a test case, since one was requested
during the initial review.  Jim has moved on from Linaro, so I'm pinging this
patch in the hope that it is OK for inclusion, since it was posted before the
freeze and its impact is isolated to Falkor.

Siddhesh

[1] https://gcc.gnu.org/ml/gcc-patches/2017-09/msg01547.html
[2] https://gcc.gnu.org/ml/gcc-patches/2017-11/msg00050.html

On Falkor, because of an idiosyncrasy of how the pipelines are designed, a
quad-word store using a reg+reg addressing mode is almost twice as slow as an
add followed by a quad-word store with a single reg addressing mode.  So we
get better performance if we disallow addressing modes using register offsets
with quad-word stores.

Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13%
performance increase on stream copy using this patch, and a 16% performance
increase on stream scale using this patch.  I also see a small performance
increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3.

2018-01-17  Jim Wilson  <jim.wilson@linaro.org>
	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	gcc/
	* config/aarch64/aarch64-protos.h (aarch64_falkor_movti_target_operand_p): Declare.
	* config/aarch64/aarch64.c (aarch64_falkor_movti_target_operand_p): New.
	* config/aarch64/constraints.md (Utf): New.
	* config/aarch64/aarch64.md (movti_aarch64): Use Utf constraint instead
	of m.
	(movtf_aarch64): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf
	constraint instead of m.

	gcc/testsuite/
	* gcc.target/aarch64/pr82533.c: New test case.

---
 gcc/config/aarch64/aarch64-protos.h        |  1 +
 gcc/config/aarch64/aarch64-simd.md         |  4 ++--
 gcc/config/aarch64/aarch64.c               | 10 ++++++++++
 gcc/config/aarch64/aarch64.md              |  6 +++---
 gcc/config/aarch64/constraints.md          |  6 ++++++
 gcc/testsuite/gcc.target/aarch64/pr82533.c | 11 +++++++++++
 6 files changed, 33 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c

-- 
2.1.4

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2d705d2..088d864 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -433,6 +433,7 @@  bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
 bool aarch64_sve_struct_memory_operand_p (rtx);
+bool aarch64_falkor_movti_target_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..f7daac3 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -131,9 +131,9 @@ 
 
 (define_insn "*aarch64_simd_mov<VQ:mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq, Utf,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz,    w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e70f3a..0db7a4f 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13477,6 +13477,16 @@  aarch64_sve_struct_memory_operand_p (rtx op)
 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
 }
 
+/* Return TRUE if OP is a good address mode for movti target on falkor.  */
+bool
+aarch64_falkor_movti_target_operand_p (rtx op)
+{
+  if ((enum attr_tune) aarch64_tune == TUNE_FALKOR)
+    return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS
+			    && ! CONST_INT_P (XEXP (XEXP (op, 0), 1)));
+  return MEM_P (op);
+}
+
 /* Emit a register copy from operand to operand, taking care not to
    early-clobber source registers in the process.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index edb6a75..696fd12 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1079,7 +1079,7 @@ 
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,Utf")
 	(match_operand:TI 1
 	 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
@@ -1226,9 +1226,9 @@ 
 
 (define_insn "*movtf_aarch64"
   [(set (match_operand:TF 0
-	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m")
+	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m")
 	(match_operand:TF 1
-	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))]
+	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w  ,m ,?r,Y"))]
   "TARGET_FLOAT && (register_operand (operands[0], TFmode)
     || aarch64_reg_or_fp_zero (operands[1], TFmode))"
   "@
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 6cc4cad..d9f2921 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -229,6 +229,12 @@ 
   (and (match_code "mem")
        (match_test "aarch64_sve_ldr_operand_p (op)")))
 
+(define_memory_constraint "Utf"
+  "@internal
+   A good address for a falkor movti target operand."
+  (and (match_code "mem")
+       (match_test "aarch64_falkor_movti_target_operand_p (op)")))
+
 (define_memory_constraint "Utv"
   "@internal
    An address valid for loading/storing opaque structure
diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c
new file mode 100644
index 0000000..fa28ffa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */
+
+void
+copy (int N, double *c, double *a)
+{
+  for (int i = 0; i < N; ++i)
+    c[i] = a[i];
+}
+
+/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */