[Arm] ACLE intrinsics for AdvSIMD bfloat16 dot product

Message ID bd4bf7c2-f298-e683-dc9e-0bbee452323c@arm.com
State New
Headers show
Series
  • [Arm] ACLE intrinsics for AdvSIMD bfloat16 dot product
Related show

Commit Message

Dennis Zhang Jan. 7, 2020, 12:12 p.m.
Hi all,

This patch is part of a series adding support for Armv8.6-A features.
It depends on the patch enabling Arm BFmode 
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html

This patch adds intrinsics for brain half-precision float-point dot product.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

Regression tested for arm-none-linux-gnueabi-armv8-a.

Is it OK for trunk please?

Thanks,
Dennis

gcc/ChangeLog:

2020-01-03  Dennis Zhang  <dennis.zhang@arm.com>

	* config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New
	(vbfdot_lane_f32, vbfdotq_laneq_f32): New.
	(vbfdot_laneq_f32, vbfdotq_lane_f32): New.
	* config/arm/arm_neon_builtins.def (vbfdot): New.
	(vbfdot_lanev4bf, vbfdot_lanev8bf): New.
	* config/arm/iterators.md (VSF2BF): New mode attribute.
	* config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New.
	(neon_vbfdot_lanev4bf<VCVTF:mode>): New.
	(neon_vbfdot_lanev8bf<VCVTF:mode>): New.

gcc/testsuite/ChangeLog:

2020-01-03  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/arm/simd/bf16_dot_1.c: New test.
	* gcc.target/arm/simd/bf16_dot_2.c: New test.

Patch

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 7433559f00020a4f7878dff22ddc2b9d40bb2e06..1d9e7d40ccdd86e9ece300b9e08c78bcffe915a6 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18745,6 +18745,59 @@  vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
 #pragma GCC pop_options
 #endif
 
+/* AdvSIMD Brain half-precision float-point (Bfloat16) intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_neon_vbfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+		 const int __index)
+{
+  return __builtin_neon_vbfdot_lanev4bfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfdot_lanev8bfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+		  const int __index)
+{
+  return __builtin_neon_vbfdot_lanev8bfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		  const int __index)
+{
+  return __builtin_neon_vbfdot_lanev4bfv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index bcccf93f7fa2750e9006e5856efecbec0fb331b9..367fd21f5546c6b5a49d79df2822537cbb98e1f7 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,7 @@  VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR2 (TERNOP, vbfdot, v2sf, v4sf)
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
\ No newline at end of file
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 439021fa0733ac31706287c4f98d62b080afc3a1..eb001131dc5cb7bed2afe428664d7c863595c60c 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -834,6 +834,8 @@ 
 (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
 
+(define_mode_attr VSF2BF [(V2SF "V4BF") (V4SF "V8BF")])
+
 ;;----------------------------------------------------------------------------
 ;; Code attributes
 ;;----------------------------------------------------------------------------
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 3e7ebd7464d4d42eac6a525b5f1b39eae08c9086..248c5f622421d7e8197adb23d7f28588840ff772 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -6556,3 +6556,51 @@  if (BYTES_BIG_ENDIAN)
  "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
  [(set_attr "type" "neon_fp_abd_s<q>")]
 )
+
+(define_insn "neon_vbfdot<VCVTF:mode>"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+	(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+		    (unspec:VCVTF [
+			    (match_operand:<VSF2BF> 2 "register_operand" "w")
+			    (match_operand:<VSF2BF> 3 "register_operand" "w")]
+		     UNSPEC_DOT_S)))]
+  "TARGET_BF16_SIMD"
+  "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "neon_vbfdot_lanev4bf<VCVTF:mode>"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+	(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+		    (unspec:VCVTF [
+			    (match_operand:<VSF2BF> 2 "register_operand" "w")
+			    (match_operand:V4BF 3 "register_operand" "x")
+			    (match_operand:SI 4 "immediate_operand" "i")]
+		     UNSPEC_DOT_S)))]
+  "TARGET_BF16_SIMD"
+  "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "neon_vbfdot_lanev8bf<VCVTF:mode>"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+	(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+		    (unspec:VCVTF [
+			    (match_operand:<VSF2BF> 2 "register_operand" "w")
+			    (match_operand:V8BF 3 "register_operand" "x")
+			    (match_operand:SI 4 "immediate_operand" "i")]
+		     UNSPEC_DOT_S)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    int half = GET_MODE_NUNITS (GET_MODE (operands[3])) / 4;
+    if (lane < half)
+      return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+    else
+      {
+	operands[4] = GEN_INT (lane - half);
+	return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+      }
+  }
+  [(set_attr "type" "neon_dot<q>")]
+)
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..284df23a5d4f6bf9b74ba71a4b7ced83588babc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c
@@ -0,0 +1,47 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-save-temps -O2" } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+extern void abort();
+
+#define TEST(n, f, r, a, b, ...) { \
+  float32x##n##_t f##_ref = { __VA_ARGS__ }; \
+  float32x##n##_t f##_out = f (r, a, b); \
+  for (int i = 0; i < n; i++) \
+    if (f##_out[i] != f##_ref[i]) \
+      abort(); \
+}
+
+#define TEST_LANE(n, f, r, a, b, l, ...) { \
+  float32x##n##_t f##_ref = { __VA_ARGS__ }; \
+  float32x##n##_t f##_out = f (r, a, b, l); \
+  for (int i = 0; i < n; i++) \
+    if (f##_out[i] != f##_ref[i]) \
+      abort(); \
+}
+
+int
+main()
+{
+  bfloat16x4_t x4 = {1, 1, 1, 1};
+  bfloat16x8_t x8 = {1, 1, 1, 1, 2, 2, 2, 2};
+  float32x2_t r2 = {0, 0};
+  float32x4_t r4 = {0, 0, 0, 0};
+
+  TEST(2, vbfdot_f32, r2, x4, x4, 2, 2);
+  TEST(4, vbfdotq_f32, r4, x8, x8, 2, 2, 2, 2);
+  TEST_LANE(2, vbfdot_lane_f32, r2, x4, x4, 0, 2, 2);
+  TEST_LANE(4, vbfdotq_lane_f32, r4, x8, x4, 1, 2, 2, 2, 2);
+  TEST_LANE(2, vbfdot_laneq_f32, r2, x4, x8, 2, 2, 2);
+  TEST_LANE(4, vbfdotq_laneq_f32, r4, x8, x8, 3, 2, 2, 2, 2);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times {vdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {vdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {vdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {vdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]\n} 2 } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..06a3c6d0269f8703b7faf2fd8dd283c6b8f76fc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_lane_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfdot_lanev4bfv2sf (r, a, b, 2); /* { dg-error {out of range 0 - 1} } */
+}
+
+float32x4_t
+test_vbfdotq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return __builtin_neon_vbfdot_lanev4bfv4sf (r, a, b, 2); /* { dg-error {out of range 0 - 1} } */
+}
+
+float32x2_t
+test_vbfdot_laneq_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  return __builtin_neon_vbfdot_lanev8bfv2sf (r, a, b, 4); /* { dg-error {out of range 0 - 3} } */
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return __builtin_neon_vbfdot_lanev8bfv4sf (r, a, b, 4); /* { dg-error {out of range 0 - 3} } */
+}
\ No newline at end of file