[AArch64] ACLE 8-bit integer matrix multiply-accumulate intrinsics

Message ID 1e248c26-6a5c-a9f6-ce38-95ac6616fd15@arm.com

Commit Message

Dennis Zhang Dec. 16, 2019, 1:53 p.m.
Hi all,

This patch is part of a series adding support for Armv8.6-A features.
It depends on the Armv8.6-A effective target checking patch, 
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html.

This patch adds intrinsics for matrix multiply-accumulate operations 
including vmmlaq_s32, vmmlaq_u32, and vusmmlaq_s32.

ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest
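
For reference, these map to the SMMLA/UMMLA/USMMLA instructions, whose
effect can be sketched as a scalar loop (an illustration based on the ISA
description, not part of the patch; the helper name is mine): the first
source is read as a 2x8 matrix of bytes in row-major order, the second as
an 8x2 matrix in column-major order, and the 2x2 32-bit product matrix is
accumulated row-major into the destination.

  #include <stdint.h>

  /* Scalar reference for vmmlaq_s32; vmmlaq_u32 is the same loop with
     unsigned inputs, and vusmmlaq_s32 takes unsigned a and signed b.  */
  static void
  smmla_reference (int32_t r[4], const int8_t a[16], const int8_t b[16])
  {
    for (int i = 0; i < 2; i++)
      for (int j = 0; j < 2; j++)
        for (int k = 0; k < 8; k++)
          r[2 * i + j] += (int32_t) a[8 * i + k] * (int32_t) b[8 * j + k];
  }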

Regtested & bootstrapped for aarch64-none-linux-gnu.

Is it OK for trunk please?

Thanks
Dennis

gcc/ChangeLog:

2019-12-13  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
	(simd_ummla, simd_usmmla): New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
	(vusmmlaq_s32): New.
	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
	(sur): Likewise.
	(MATMUL): New.

gcc/testsuite/ChangeLog:

2019-12-13  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/vmmla.c: New test.

Comments

Dennis Zhang Jan. 23, 2020, 3:01 p.m. | #1
Hi all,

On 16/12/2019 13:53, Dennis Zhang wrote:
> Hi all,
>
> This patch is part of a series adding support for Armv8.6-A features.
> It depends on the Armv8.6-A effective target checking patch,
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html.
>
> This patch adds intrinsics for matrix multiply-accumulate operations
> including vmmlaq_s32, vmmlaq_u32, and vusmmlaq_s32.
>
> ACLE documents are at https://developer.arm.com/docs/101028/latest
> ISA documents are at https://developer.arm.com/docs/ddi0596/latest
>
> Regtested & bootstrapped for aarch64-none-linux-gnu.
>
> Is it OK for trunk please?

This patch has been rebased onto the current trunk.
It no longer depends on any other patches.
Regtested again.

Is it OK for trunk please?

Cheers
Dennis

gcc/ChangeLog:

2020-01-23  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
	(simd_ummla, simd_usmmla): New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
	(vusmmlaq_s32): New.
	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
	(sur): Likewise.
	(MATMUL): New iterator.

gcc/testsuite/ChangeLog:

2020-01-23  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/vmmla.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index f0e0461b7f0..033a6d4e92f 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -176,6 +176,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned,
       qualifier_unsigned, qualifier_immediate };
 #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
 
 
 static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 57fc5933b43..06025b110cc 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
+  VAR1 (TERNOP, simd_smmla, 0, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
\ No newline at end of file
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2989096b170..409ec28d293 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7025,3 +7025,15 @@
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; 8-bit integer matrix multiply-accumulate
+(define_insn "aarch64_simd_<sur>mmlav16qi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(plus:V4SI (match_operand:V4SI 1 "register_operand" "0")
+		   (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
+				 (match_operand:V16QI 3 "register_operand" "w")]
+		    MATMUL)))]
+  "TARGET_I8MM"
+  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
+  [(set_attr "type" "neon_mla_s_q")]
+)
\ No newline at end of file
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index eaba156e26c..918000d98dc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34609,6 +34609,36 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+/* Matrix Multiply-Accumulate.  */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+
 #include "arm_bf16.h"
 
 #undef __aarch64_vget_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b9843b83c5f..57aca36f646 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -581,6 +581,9 @@
     UNSPEC_FMLSL	; Used in aarch64-simd.md.
     UNSPEC_FMLAL2	; Used in aarch64-simd.md.
     UNSPEC_FMLSL2	; Used in aarch64-simd.md.
+    UNSPEC_SMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_UMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_USMATMUL	; Used in aarch64-simd.md.
     UNSPEC_ADR		; Used in aarch64-sve.md.
     UNSPEC_SEL		; Used in aarch64-sve.md.
     UNSPEC_BRKA		; Used in aarch64-sve.md.
@@ -2531,6 +2534,8 @@
 
 (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
 
+(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL UNSPEC_USMATMUL])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -2738,6 +2743,8 @@
 		      (UNSPEC_URSHL  "ur") (UNSPEC_SRSHL  "sr")
 		      (UNSPEC_UQRSHL  "u") (UNSPEC_SQRSHL  "s")
 		      (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
+		      (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u")
+		      (UNSPEC_USMATMUL "us")
 ])
 
 (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
new file mode 100644
index 00000000000..348b2f51779
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
@@ -0,0 +1,37 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-options "-save-temps -O2" } */
+/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
+
+#include "arm_neon.h"
+
+extern void abort();
+
+#define VAR4(v) {v, v, v, v}
+#define VAR16(v) {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
+#define TEST(t, f, r, a, b, ...) { \
+  t##32x4_t f##_ref = { __VA_ARGS__ }; \
+  t##32x4_t f##_out =  f (r, a, b); \
+  for (int i = 0; i < 4; i++) \
+    if (f##_out[i] != f##_ref[i]) \
+      abort(); \
+}
+
+int
+main()
+{
+  int32x4_t s32 = VAR4(-1);
+  uint32x4_t u32 = VAR4(1);
+  int8x16_t s8 = VAR16(-1);
+  uint8x16_t u8 = VAR16(1);
+
+  TEST(int, vmmlaq_s32, s32, s8, s8, 7, 7, 7, 7);
+  TEST(uint, vmmlaq_u32, u32, u8, u8, 9, 9, 9, 9);
+  TEST(int, vusmmlaq_s32, s32, u8, s8, -9, -9, -9, -9);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler {smmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
+/* { dg-final { scan-assembler {ummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
+/* { dg-final { scan-assembler {usmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
\ No newline at end of file
Richard Sandiford Jan. 23, 2020, 3:28 p.m. | #2
Dennis Zhang <Dennis.Zhang@arm.com> writes:
> Hi all,
>
> On 16/12/2019 13:53, Dennis Zhang wrote:
>> [...]
>
> This patch has been rebased onto the current trunk.
> It no longer depends on any other patches.
> Regtested again.
>
> Is it OK for trunk please?
>
> Cheers
> Dennis
>
> [...]
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 2989096b170..409ec28d293 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7025,3 +7025,15 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +;; 8-bit integer matrix multiply-accumulate
> +(define_insn "aarch64_simd_<sur>mmlav16qi"
> +  [(set (match_operand:V4SI 0 "register_operand" "=w")
> +	(plus:V4SI (match_operand:V4SI 1 "register_operand" "0")
> +		   (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
> +				 (match_operand:V16QI 3 "register_operand" "w")]
> +		    MATMUL)))]
> +  "TARGET_I8MM"
> +  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
> +  [(set_attr "type" "neon_mla_s_q")]
> +)
> \ No newline at end of file

(Would be good to add the newline)

The canonical rtl order for commutative operations like plus is
to put the most complicated expression first (roughly speaking --
the rules are a bit more precise than that).  So this should be:

  [(set (match_operand:V4SI 0 "register_operand" "=w")
	(plus:V4SI (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
				 (match_operand:V16QI 3 "register_operand" "w")]
		    MATMUL)
		   (match_operand:V4SI 1 "register_operand" "0")))]

> [...]
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
> new file mode 100644
> index 00000000000..348b2f51779
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
> @@ -0,0 +1,37 @@
> +/* { dg-do assemble } */

I assume this should be dg-run, otherwise there's no point in having
the main function and comparison.  The dg-run would need to be
conditional on whether the target supports i8mm.

Alternatively, we could keep it simple and stick to an assembler
test, in which case I think we should have one function per call,
with no main.

> +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> [...]
> +/* { dg-final { scan-assembler {smmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
> +/* { dg-final { scan-assembler {ummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
> +/* { dg-final { scan-assembler {usmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
> \ No newline at end of file

This is going to look like inventing a new rule, sorry, since nothing
else in the directory does this yet, but: IMO it's better to put a
\t at the beginning of each scan-assembler.  As it stands the
usmmla instruction would satisfy the first scan-assembler too,
so we wouldn't pick up cases in which smmla failed to be generated.
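
For illustration (hypothetical assembler output, not from the patch): the
line "usmmla\tv0.4s, v1.16b, v2.16b" contains "smmla\tv0.4s" as a
substring, so the unanchored pattern {smmla\tv[0-9]+...} matches it too; a
leading \t pins the match to the start of the mnemonic field, since the
assembler indents each instruction with a tab.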

Thanks,
Richard
Dennis Zhang Jan. 27, 2020, 12:09 p.m. | #3
Hi Richard,

On 23/01/2020 15:28, Richard Sandiford wrote:
> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>> [...]
>> +  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
>> +  [(set_attr "type" "neon_mla_s_q")]
>> +)
>> \ No newline at end of file
>
> (Would be good to add the newline)
>
> The canonical rtl order for commutative operations like plus is
> to put the most complicated expression first (roughly speaking --
> the rules are a bit more precise than that).  So this should be:
>
>    [(set (match_operand:V4SI 0 "register_operand" "=w")
> 	(plus:V4SI (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
> 				 (match_operand:V16QI 3 "register_operand" "w")]
> 		    MATMUL)
> 		   (match_operand:V4SI 1 "register_operand" "0")))]
>
>> [...]
>> +/* { dg-do assemble } */
>
> I assume this should be dg-run, otherwise there's no point in having
> the main function and comparison.  The dg-run would need to be
> conditional on whether the target supports i8mm.
>
> Alternatively, we could keep it simple and stick to an assembler
> test, in which case I think we should have one function per call,
> with no main.
>
>> [...]
>> +/* { dg-final { scan-assembler {smmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
>> +/* { dg-final { scan-assembler {ummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
>> +/* { dg-final { scan-assembler {usmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
>
> This is going to look like inventing a new rule, sorry, since nothing
> else in the directory does this yet, but: IMO it's better to put a
> \t at the beginning of each scan-assembler.  As it stands the
> usmmla instruction would satisfy the first scan-assembler too,
> so we wouldn't pick up cases in which smmla failed to be generated.
>
> Thanks,
> Richard

Thanks a lot for the review.
The patch is updated as suggested:
1. Fixed the RTL format (canonical operand order for plus).
2. Made the test assembler-only; each scan-assembler pattern now starts
with '\t' to avoid ambiguous matches.

Could you please check if it's OK for trunk?

Cheers
Dennis

gcc/ChangeLog:

2020-01-23  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
	(simd_ummla, simd_usmmla): New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
	(vusmmlaq_s32): New.
	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
	(sur): Likewise.
	(MATMUL): New iterator.

gcc/testsuite/ChangeLog:

2020-01-23  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/simd/vmmla.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index f0e0461b7f0..033a6d4e92f 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -176,6 +176,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned,
       qualifier_unsigned, qualifier_immediate };
 #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
 
 
 static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 57fc5933b43..885c2540514 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
+  VAR1 (TERNOP, simd_smmla, 0, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2989096b170..b7659068b7d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7025,3 +7025,15 @@
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; 8-bit integer matrix multiply-accumulate
+(define_insn "aarch64_simd_<sur>mmlav16qi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(plus:V4SI
+	 (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
+		       (match_operand:V16QI 3 "register_operand" "w")] MATMUL)
+	 (match_operand:V4SI 1 "register_operand" "0")))]
+  "TARGET_I8MM"
+  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
+  [(set_attr "type" "neon_mla_s_q")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index eaba156e26c..918000d98dc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34609,6 +34609,36 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+/* Matrix Multiply-Accumulate.  */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+
 #include "arm_bf16.h"
 
 #undef __aarch64_vget_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b9843b83c5f..57aca36f646 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -581,6 +581,9 @@
     UNSPEC_FMLSL	; Used in aarch64-simd.md.
     UNSPEC_FMLAL2	; Used in aarch64-simd.md.
     UNSPEC_FMLSL2	; Used in aarch64-simd.md.
+    UNSPEC_SMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_UMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_USMATMUL	; Used in aarch64-simd.md.
     UNSPEC_ADR		; Used in aarch64-sve.md.
     UNSPEC_SEL		; Used in aarch64-sve.md.
     UNSPEC_BRKA		; Used in aarch64-sve.md.
@@ -2531,6 +2534,8 @@
 
 (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
 
+(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL UNSPEC_USMATMUL])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -2738,6 +2743,8 @@
 		      (UNSPEC_URSHL  "ur") (UNSPEC_SRSHL  "sr")
 		      (UNSPEC_UQRSHL  "u") (UNSPEC_SQRSHL  "s")
 		      (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
+		      (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u")
+		      (UNSPEC_USMATMUL "us")
 ])
 
 (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c
new file mode 100644
index 00000000000..c3d31f128c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble} */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
+
+#include "arm_neon.h"
+
+int32x4_t
+test_vmmlaq_s32 (int32x4_t r, int8x16_t a, int8x16_t b)
+{
+  return vmmlaq_s32 (r, a, b);
+}
+
+uint32x4_t
+test_vmmlaq_u32 (uint32x4_t r, uint8x16_t a, uint8x16_t b)
+{
+  return vmmlaq_u32 (r, a, b);
+}
+
+int32x4_t
+test_vusmmlaq_s32 (int32x4_t r, uint8x16_t a, int8x16_t b)
+{
+  return vusmmlaq_s32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times "\tsmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
+/* { dg-final { scan-assembler-times "\tummla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
+/* { dg-final { scan-assembler-times "\tusmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
Richard Sandiford Jan. 27, 2020, 1:01 p.m. | #4
Dennis Zhang <Dennis.Zhang@arm.com> writes:
> [...]

> gcc/ChangeLog:
>
> 2020-01-23  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
> 	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
> 	(simd_ummla, simd_usmmla): New.
> 	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
> 	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
> 	(vusmmlaq_s32): New.
> 	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
> 	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
> 	(sur): Likewise.
> 	(MATMUL): New iterator.
>
> gcc/testsuite/ChangeLog:
>
> 2020-01-23  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* gcc.target/aarch64/simd/vmmla.c: New test.

OK, thanks.

One note below...

> [...]
>
> +/* { dg-final { scan-assembler-times "\tsmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
> +/* { dg-final { scan-assembler-times "\tummla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
> +/* { dg-final { scan-assembler-times "\tusmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */

It looks like you've switched to using "..." to quote the regexp,
which leads to a lot of extra backslashes.  The {...} quoting you had
in your original patch was better and easier to read IMO.

Both forms are OK though, and both are widely used, so there's no need
to change it.  Just mentioning it for next time really.
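
For comparison (illustration only), the same match written both ways:

    {\tsmmla\tv[0-9]+\.4s}          (braces: Tcl performs no substitution,
                                     so the regexp escapes stay as written)
    "\tsmmla\\tv\[0-9\]\+\\.4s"     (quotes: brackets and backslashes must
                                     be escaped from Tcl first)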

Thanks,
Richard
Dennis Zhang Feb. 7, 2020, 3:20 p.m. | #5
Hi all,

On 27/01/2020 13:01, Richard Sandiford wrote:
> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>> [...]
>
> OK, thanks.
>
> One note below...
>
>> [...]
>> +/* { dg-final { scan-assembler-times "\tsmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
>> +/* { dg-final { scan-assembler-times "\tummla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
>> +/* { dg-final { scan-assembler-times "\tusmmla\\tv\[0-9\]\+\\.4s, v\[0-9\]\+\\.16b, v\[0-9\]\+\\.16b" 1 } } */
>
> It looks like you've switched to using "..." to quote the regexp,
> which leads to a lot of extra backslashes.  The {...} quoting you had
> in your original patch was better and easier to read IMO.
>
> Both forms are OK though, and both are widely used, so there's no need
> to change it.  Just mentioning it for next time really.
>
> Thanks,
> Richard

Sorry for the delay since the last update.
This patch has been rebased onto the current trunk and pushed to master as
40f648378061c170cf6a9ab680af01b3a3a83569.

Cheers
Dennis

gcc/ChangeLog:

2020-02-07  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New entry.
	(simd_ummla, simd_usmmla): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
	(vusmmlaq_s32): New.

gcc/testsuite/ChangeLog:

2020-02-07  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/simd/vmmla.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 02b2154cf64..fe3c7f1f832 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -703,3 +703,8 @@
   VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
+
+  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
+  VAR1 (TERNOP, simd_smmla, 0, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f2b440c36bb..c8e1012bd7f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7173,3 +7173,15 @@
 }
   [(set_attr "type" "neon_fp_mla_s_scalar_q")]
 )
+
+;; 8-bit integer matrix multiply-accumulate
+(define_insn "aarch64_simd_<sur>mmlav16qi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(plus:V4SI
+	 (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
+		       (match_operand:V16QI 3 "register_operand" "w")] MATMUL)
+	 (match_operand:V4SI 1 "register_operand" "0")))]
+  "TARGET_I8MM"
+  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
+  [(set_attr "type" "neon_mla_s_q")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index db845a3d2d2..a6bcdf1f64a 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34797,6 +34797,29 @@ vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b,
   return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
 }
 
+/* Matrix Multiply-Accumulate.  */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+}
+
 #pragma GCC pop_options
 
 #undef __aarch64_vget_lane_any
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c
new file mode 100644
index 00000000000..5eec2b5cfb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble} */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
+
+#include "arm_neon.h"
+
+int32x4_t
+test_vmmlaq_s32 (int32x4_t r, int8x16_t a, int8x16_t b)
+{
+  return vmmlaq_s32 (r, a, b);
+}
+
+uint32x4_t
+test_vmmlaq_u32 (uint32x4_t r, uint8x16_t a, uint8x16_t b)
+{
+  return vmmlaq_u32 (r, a, b);
+}
+
+int32x4_t
+test_vusmmlaq_s32 (int32x4_t r, uint8x16_t a, int8x16_t b)
+{
+  return vusmmlaq_s32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {\tsmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */
+/* { dg-final { scan-assembler-times {\tummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */
+/* { dg-final { scan-assembler-times {\tusmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */
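
As a usage note (hypothetical command line, assuming a compiler that
includes this patch), the new test can be exercised by hand with:

    gcc -O2 -march=armv8.2-a+i8mm -S vmmla.c

which should emit exactly one smmla, one ummla, and one usmmla
instruction, matching the scan-assembler-times counts above.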
