[committed] aarch64: A couple of mul_laneq tweaks

Message ID mpt7dk534ap.fsf@arm.com
State New
Headers show
Series
  • [committed] aarch64: A couple of mul_laneq tweaks
Related show

Commit Message

Jakub Jelinek via Gcc-patches May 11, 2021, 11:20 a.m.
This patch removes the duplication between the mul_laneq<mode>3
and the older mul-lane patterns.  The older patterns were previously
divided into two based on whether the indexed operand had the same mode
as the other operands or whether it had the opposite length from the
other operands (64-bit vs. 128-bit).  However, it seemed easier to
divide them instead based on whether the indexed operand was 64-bit or
128-bit, since that maps directly to the arm_neon.h “q” conventions.

Also, it looks like the older patterns were missing cases for
V8HF<->V4HF combinations, which meant that vmul_laneq_f16 and
vmulq_lane_f16 didn't produce single instructions.

There was a typo in the V2SF entry for VCONQ, but in practice
no patterns were using that entry until now.

The test passes for both endiannesses, but endianness does change
the mapping between regexps and functions.

Tested on aarch64-linux-gnu and aarch64_be-elf, pushed to trunk.

Richard


gcc/
	* config/aarch64/iterators.md (VMUL_CHANGE_NLANES): Delete.
	(VMULD): New iterator.
	(VCOND): Handle V4HF and V8HF.
	(VCONQ): Fix entry for V2SF.
	* config/aarch64/aarch64-simd.md (mul_lane<mode>3): Use VMULD
	instead of VMUL.  Use a 64-bit vector mode for the indexed operand.
	(*aarch64_mul3_elt_<vswap_width_name><mode>): Merge with...
	(mul_laneq<mode>3): ...this define_insn.  Use VMUL instead of VDQSF.
	Use a 128-bit vector mode for the indexed operand.  Use stype for
	the scheduling type.

gcc/testsuite/
	* gcc.target/aarch64/fmul_lane_1.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md            | 46 +++++----------
 gcc/config/aarch64/iterators.md               | 13 ++--
 .../gcc.target/aarch64/fmul_lane_1.c          | 59 +++++++++++++++++++
 3 files changed, 82 insertions(+), 36 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmul_lane_1.c

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 234762960bd..99620895e78 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -719,51 +719,35 @@  (define_expand "copysign<mode>3"
 )
 
 (define_insn "mul_lane<mode>3"
- [(set (match_operand:VMUL 0 "register_operand" "=w")
-       (mult:VMUL
-	 (vec_duplicate:VMUL
+ [(set (match_operand:VMULD 0 "register_operand" "=w")
+       (mult:VMULD
+	 (vec_duplicate:VMULD
 	   (vec_select:<VEL>
-	     (match_operand:VMUL 2 "register_operand" "<h_con>")
+	     (match_operand:<VCOND> 2 "register_operand" "<h_con>")
 	     (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
-	 (match_operand:VMUL 1 "register_operand" "w")))]
+	 (match_operand:VMULD 1 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "<f>mul\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
 (define_insn "mul_laneq<mode>3"
-  [(set (match_operand:VDQSF 0 "register_operand" "=w")
-	(mult:VDQSF
-	  (vec_duplicate:VDQSF
-	    (vec_select:<VEL>
-	      (match_operand:V4SF 2 "register_operand" "w")
-	      (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
-	  (match_operand:VDQSF 1 "register_operand" "w")))]
-  "TARGET_SIMD"
-  {
-    operands[3] = aarch64_endian_lane_rtx (V4SFmode, INTVAL (operands[3]));
-    return "fmul\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]";
-  }
-  [(set_attr "type" "neon_fp_mul_s_scalar<q>")]
-)
-
-(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
-  [(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w")
-     (mult:VMUL_CHANGE_NLANES
-       (vec_duplicate:VMUL_CHANGE_NLANES
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+     (mult:VMUL
+       (vec_duplicate:VMUL
 	  (vec_select:<VEL>
-	    (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
-	    (parallel [(match_operand:SI 2 "immediate_operand")])))
-      (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
+	    (match_operand:<VCONQ> 2 "register_operand" "<h_con>")
+	    (parallel [(match_operand:SI 3 "immediate_operand")])))
+      (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
-    return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
+    return "<f>mul\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]";
   }
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
 (define_insn "mul_n<mode>3"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index c57aa6bf2f4..69d9dbebe8f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -312,15 +312,17 @@  (define_mode_iterator SX2 [SI SF])
 (define_mode_iterator DSX [DF DI SF SI])
 
 
-;; Modes available for Advanced SIMD <f>mul lane operations.
+;; Modes available for Advanced SIMD <f>mul operations.
 (define_mode_iterator VMUL [V4HI V8HI V2SI V4SI
 			    (V4HF "TARGET_SIMD_F16INST")
 			    (V8HF "TARGET_SIMD_F16INST")
 			    V2SF V4SF V2DF])
 
-;; Modes available for Advanced SIMD <f>mul lane operations changing lane
-;; count.
-(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
+;; The subset of VMUL for which VCOND is a vector mode.
+(define_mode_iterator VMULD [V4HI V8HI V2SI V4SI
+			     (V4HF "TARGET_SIMD_F16INST")
+			     (V8HF "TARGET_SIMD_F16INST")
+			     V2SF V4SF])
 
 ;; Iterators for single modes, for "@" patterns.
 (define_mode_iterator VNx16QI_ONLY [VNx16QI])
@@ -1201,6 +1203,7 @@  (define_mode_attr VCOND [(HI "V4HI") (SI "V2SI")
 			 (V4HI "V4HI") (V8HI "V4HI")
 			 (V2SI "V2SI") (V4SI "V2SI")
 			 (DI   "DI") (V2DI "DI")
+			 (V4HF "V4HF") (V8HF "V4HF")
 			 (V2SF "V2SF") (V4SF "V2SF")
 			 (V2DF "DF")])
 
@@ -1210,7 +1213,7 @@  (define_mode_attr VCONQ [(V8QI "V16QI") (V16QI "V16QI")
 			 (V2SI "V4SI") (V4SI "V4SI")
 			 (DI   "V2DI") (V2DI "V2DI")
 			 (V4HF "V8HF") (V8HF "V8HF")
-			 (V2SF "V2SF") (V4SF "V4SF")
+			 (V2SF "V4SF") (V4SF "V4SF")
 			 (V2DF "V2DF") (SI   "V4SI")
 			 (HI   "V8HI") (QI   "V16QI")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_lane_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_lane_1.c
new file mode 100644
index 00000000000..a2b57581c84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_lane_1.c
@@ -0,0 +1,59 @@ 
+/* { dg-options "-O" } */
+
+#pragma GCC target "+simd+fp16"
+
+__Float16x4_t
+f1 (__Float16x4_t x, __Float16x4_t y)
+{
+  return x * y[0];
+}
+
+__Float16x4_t
+f2 (__Float16x4_t x, __Float16x4_t y)
+{
+  return x * y[3];
+}
+
+__Float16x4_t
+f3 (__Float16x4_t x, __Float16x8_t y)
+{
+  return x * y[0];
+}
+
+__Float16x4_t
+f4 (__Float16x4_t x, __Float16x8_t y)
+{
+  return x * y[7];
+}
+
+__Float16x8_t
+f5 (__Float16x8_t x, __Float16x4_t y)
+{
+  return x * y[0];
+}
+
+__Float16x8_t
+f6 (__Float16x8_t x, __Float16x4_t y)
+{
+  return x * y[3];
+}
+
+__Float16x8_t
+f7 (__Float16x8_t x, __Float16x8_t y)
+{
+  return x * y[0];
+}
+
+__Float16x8_t
+f8 (__Float16x8_t x, __Float16x8_t y)
+{
+  return x * y[7];
+}
+
+/* { dg-final { scan-assembler-times {\tfmul\tv0.4h, v0.4h, v1.h\[0\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tv0.4h, v0.4h, v1.h\[3\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tv0.4h, v0.4h, v1.h\[7\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tv0.8h, v0.8h, v1.h\[0\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tv0.8h, v0.8h, v1.h\[3\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tv0.8h, v0.8h, v1.h\[7\]} 1 } } */