[arm] Armv8.6-A Matrix Multiply extension [9/10]

Message ID 5db8636d.1c69fb81.daa1b.9a4fSMTPIN_ADDED_MISSING@mx.google.com
State New
Headers show
Series
  • [arm] Armv8.6-A Matrix Multiply extension [9/10]
Related show

Commit Message

Mihail Ionescu Oct. 29, 2019, 3:20 p.m.
Hi,


This patch is part of a series that adds support for Armv8.6-A
(Matrix Multiply and BFloat16 extensions) to binutils.

This patch introduces the Int8 Matrix Multiply (i8mm) extension
to the arm backend.

The following Matrix Multiply instructions are added: vummla, vsmmla,
vusmmla, vusdot, vsudot[1].

[1]https://developer.arm.com/docs/ddi0597/latest/simd-and-floating-point-instructions-alphabetic-order

gas/ChangeLog:

2019-10-29  Mihail Ionescu <mihail.ionescu@arm.com>

	* config/tc-arm.c (arm_ext_i8mm): New feature set.
	(do_vusdot): New.
	(do_vsudot): New.
	(do_vsmmla): New.
	(do_vummla): New.
	(insns): Add vsmmla, vummla, vusmmla, vusdot, vsudot mnemonics.
	(armv86a_ext_table): Add i8mm extension.
	(arm_extensions): Move bf16 extension to context sensitive table.
	(armv82a_ext_table, armv84a_ext_table, armv85a_ext_table):
	Add bf16 and i8mm extensions to the context sensitive tables.
	* doc/c-arm.texi: Document i8mm extension.
	* testsuite/gas/arm/i8mm.s: New test.
	* testsuite/gas/arm/i8mm.d: New test.
	* testsuite/gas/arm/bfloat16-cmdline-bad-3.d: Update test.

include/ChangeLog:

2019-10-29  Mihail Ionescu <mihail.ionescu@arm.com>

	* opcode/arm.h (ARM_EXT2_I8MM): New feature macro.

opcodes/ChangeLog:

2019-10-29  Mihail Ionescu <mihail.ionescu@arm.com>

	* arm-dis.c (neon_opcodes): Add i8mm SIMD instructions.


Regression tested on arm-none-eabi.
Is this ok for trunk?

Regards,
Mihail


###############     Attachment also inlined for ease of reply    ###############
diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c
index ef2e190688d40587c570db76af30d77eca56c408..b9397528f4c840d3aec462c1e12ef42eebee25b1 100644
--- a/gas/config/tc-arm.c
+++ b/gas/config/tc-arm.c
@@ -277,6 +277,8 @@ static const arm_feature_set arm_ext_predres =
   ARM_FEATURE_CORE_HIGH (ARM_EXT2_PREDRES);
 static const arm_feature_set arm_ext_bf16 =
   ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16);
+static const arm_feature_set arm_ext_i8mm =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM);
 
 static const arm_feature_set arm_arch_any = ARM_ANY;
 #ifdef OBJ_ELF
@@ -21481,6 +21483,79 @@ do_neon_dotproduct_u (void)
   return do_neon_dotproduct (1);
 }
 
+static void
+do_vusdot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+  else
+    {
+      inst.instruction |= (1 << 21);
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+      neon_three_args (rs == NS_QQQ);
+    }
+}
+
+static void
+do_vsudot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+}
+
+static void
+do_vsmmla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+
+}
+
+static void
+do_vummla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+
+}
+
 /* Crypto v1 instructions.  */
 static void
 do_crypto_2op_1 (unsigned elttype, int op)
@@ -25998,7 +26073,7 @@ static const struct asm_opcode insns[] =
 #define	THUMB_VARIANT &arm_ext_i8mm
  TUF ("vsmmla", c200c40, fc200c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
  TUF ("vummla", c200c50, fc200c50, 3, (RNQ, RNQ, RNQ), vummla, vummla),
- TUF ("vusmmla", ca00c40, fca00c40, 3, (RNQ, RNQ, RNQ), vummla, vummla),
+ TUF ("vusmmla", ca00c40, fca00c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
  TUF ("vusdot", c800d00, fc800d00, 3, (RNDQ, RNDQ, RNDQ_RNSC), vusdot, vusdot),
  TUF ("vsudot", c800d10, fc800d10, 3, (RNDQ, RNDQ, RNSC), vsudot, vsudot),
 };
@@ -31125,6 +31200,8 @@ static const struct arm_ext_table armv82a_ext_table[] =
   ARM_ADD ("simd", FPU_ARCH_NEON_VFP_ARMV8_1),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_2_FP16),
   ARM_ADD ("fp16fml", FPU_ARCH_NEON_VFP_ARMV8_2_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_1,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
   ARM_ADD ("dotprod", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
@@ -31141,6 +31218,8 @@ static const struct arm_ext_table armv84a_ext_table[] =
 {
   ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
 
@@ -31156,6 +31235,8 @@ static const struct arm_ext_table armv85a_ext_table[] =
 {
   ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
 
@@ -31167,6 +31248,7 @@ static const struct arm_ext_table armv85a_ext_table[] =
 
 static const struct arm_ext_table armv86a_ext_table[] =
 {
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
 };
 
@@ -31306,9 +31388,6 @@ struct arm_option_extension_value_table
    use the context sensitive approach using arm_ext_table's.  */
 static const struct arm_option_extension_value_table arm_extensions[] =
 {
-  ARM_EXT_OPT ("bf16",  ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16),
-			ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16),
-			ARM_ARCH_V8_2A),
   ARM_EXT_OPT ("crc",  ARCH_CRC_ARMV8, ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
 			 ARM_FEATURE_CORE_LOW (ARM_EXT_V8)),
   ARM_EXT_OPT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8,
diff --git a/gas/doc/c-arm.texi b/gas/doc/c-arm.texi
index 8afee70120f122e3724b27a71443d03b7c2ad719..a67bb59899abfe6638977234a1c43e196d8e395c 100644
--- a/gas/doc/c-arm.texi
+++ b/gas/doc/c-arm.texi
@@ -181,6 +181,7 @@ been added, again in ascending alphabetical order.  For example,
 
 The following extensions are currently supported:
 @code{bf16} (BFloat16 extensions for v8.6-A architecture),
+@code{i8mm} (Int8 Matrix Multiply extensions for v8.6-A architecture),
 @code{crc}
 @code{crypto} (Cryptography Extensions for v8-A architecture, implies @code{fp+simd}),
 @code{dotprod} (Dot Product Extensions for v8.2-A architecture, implies @code{fp+simd}),
diff --git a/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d b/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d
index 5dfdeb4d6ccc6575e357835e10dcb2638c03de35..ad99cda5dc4bc158b70c5a678e432c2ad64fd7d3 100644
--- a/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d
+++ b/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d
@@ -1,4 +1,4 @@
 #name: Bfloat 16 bad extension
 #source: bfloat16-non-neon.s
 #as: -mno-warn-deprecated -march=armv8.1-a+bf16
-#error: .*Error: extension does not apply to the base architecture.*
+#error: .*Error: unknown architectural extension `bf16'*
diff --git a/gas/testsuite/gas/arm/i8mm.d b/gas/testsuite/gas/arm/i8mm.d
new file mode 100644
index 0000000000000000000000000000000000000000..6d7f1d74e443a047cc71ec3e2e2b515431eecceb
--- /dev/null
+++ b/gas/testsuite/gas/arm/i8mm.d
@@ -0,0 +1,36 @@
+#name: Int8 Matrix Multiply extension
+#source: i8mm.s
+#as: -mno-warn-deprecated -march=armv8.6-a+i8mm+simd -I$srcdir/$subdir
+#objdump: -dr --show-raw-insn
+
+.*: +file format .*arm.*
+
+Disassembly of section \.text:
+
+00000000 <\.text>:
+ *[0-9a-f]+:	fcea4c40 	vusmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c50 	vummla\.u8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c40 	vsmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fcea4d40 	vusdot\.s8	q10, q5, q0
+ *[0-9a-f]+:	feca4d50 	vsudot\.u8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d70 	vsudot\.u8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	feca4d40 	vusdot\.s8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d60 	vusdot\.s8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	fca5ad00 	vusdot\.s8	d10, d5, d0
+ *[0-9a-f]+:	fe85ad00 	vusdot\.s8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad20 	vusdot\.s8	d10, d5, d0\[1\]
+ *[0-9a-f]+:	fe85ad10 	vsudot\.u8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad30 	vsudot\.u8	d10, d5, d0\[1\]
+ *[0-9a-f]+:	fcea4c40 	vusmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c50 	vummla\.u8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c40 	vsmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fcea4d40 	vusdot\.s8	q10, q5, q0
+ *[0-9a-f]+:	feca4d50 	vsudot\.u8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d70 	vsudot\.u8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	feca4d40 	vusdot\.s8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d60 	vusdot\.s8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	fca5ad00 	vusdot\.s8	d10, d5, d0
+ *[0-9a-f]+:	fe85ad00 	vusdot\.s8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad20 	vusdot\.s8	d10, d5, d0\[1\]
+ *[0-9a-f]+:	fe85ad10 	vsudot\.u8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad30 	vsudot\.u8	d10, d5, d0\[1\]
diff --git a/gas/testsuite/gas/arm/i8mm.s b/gas/testsuite/gas/arm/i8mm.s
new file mode 100644
index 0000000000000000000000000000000000000000..20d04309ce009a6c56f50f4080756a913a92177e
--- /dev/null
+++ b/gas/testsuite/gas/arm/i8mm.s
@@ -0,0 +1,32 @@
+vusmmla.s8 q10, q5, q0
+vummla.u8 q10, q5, q0
+vsmmla.s8 q10, q5, q0
+
+vusdot.s8 q10, q5, q0
+vsudot.u8 q10, q5, d0[0]
+vsudot.u8 q10, q5, d0[1]
+vusdot.s8 q10, q5, d0[0]
+vusdot.s8 q10, q5, d0[1]
+
+vusdot.s8 d10, d5, d0
+vusdot.s8 d10, d5, d0[0]
+vusdot.s8 d10, d5, d0[1]
+vsudot.u8 d10, d5, d0[0]
+vsudot.u8 d10, d5, d0[1]
+
+
+vusmmla q10.s8, q5.s8, q0.s8
+vummla q10.u8, q5.u8, q0.u8
+vsmmla q10.s8, q5.s8, q0.s8
+
+vusdot q10.s8, q5.s8, q0.s8
+vsudot q10.u8, q5.u8, d0.u8[0]
+vsudot q10.u8, q5.u8, d0.u8[1]
+vusdot q10.s8, q5.s8, d0.s8[0]
+vusdot q10.s8, q5.s8, d0.s8[1]
+
+vusdot d10.s8, d5.s8, d0.s8
+vusdot d10.s8, d5.s8, d0.s8[0]
+vusdot d10.s8, d5.s8, d0.s8[1]
+vsudot d10.u8, d5.u8, d0.u8[0]
+vsudot d10.u8, d5.u8, d0.u8[1]
diff --git a/include/opcode/arm.h b/include/opcode/arm.h
index 7aea4d6e56805731d8d91f9a908c1cca332f3ab9..982da5abbdefb7ca6f76287dae9281e2615d4312 100644
--- a/include/opcode/arm.h
+++ b/include/opcode/arm.h
@@ -75,6 +75,7 @@
 #define ARM_EXT2_V8_1M_MAIN  0x00008000 /* ARMv8.1-M Mainline.		     */
 #define ARM_EXT2_V8_6A	     0x00010000	/* ARM V8.6A.			     */
 #define ARM_EXT2_BF16	     0x00020000 /* ARMv8 bfloat16.		     */
+#define ARM_EXT2_I8MM	     0x00040000 /* ARMv8.6A i8mm.		     */
 
 /* Co-processor space extensions.  */
 #define ARM_CEXT_XSCALE	     0x00000001	/* Allow MIA etc.	 	   */
diff --git a/opcodes/arm-dis.c b/opcodes/arm-dis.c
index 50ae9576561477a7c6e50628ffb20e005d9e9e59..8f82cb24e46defbfe5963237a6b5a2cc94adb258 100644
--- a/opcodes/arm-dis.c
+++ b/opcodes/arm-dis.c
@@ -1471,6 +1471,20 @@ static const struct opcode32 neon_opcodes[] =
   {ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16),
     0xfe300810, 0xffb00f10, "vfma%6?tb.bf16\t%12-15,22Q, %16-19,7Q, %0-2D[%3,5d]"},
 
+  /* Matrix Multiply instructions.  */
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfc200c40, 0xffb00f50, "vsmmla.s8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfc200c50, 0xffb00f50, "vummla.u8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfca00c40, 0xffb00f50, "vusmmla.s8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfca00d00, 0xffb00f10, "vusdot.s8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfe800d00, 0xffb00f10, "vusdot.s8\t%12-15,22R, %16-19,7R, d%0-3d[%5d]"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfe800d10, 0xffb00f10, "vsudot.u8\t%12-15,22R, %16-19,7R, d%0-3d[%5d]"},
+
   /* Two registers, miscellaneous.  */
   {ARM_FEATURE_COPROC (FPU_NEON_EXT_ARMV8),
     0xf3ba0400, 0xffbf0c10, "vrint%7-9?p?m?zaxn%u.f32\t%12-15,22R, %0-3,5R"},

Patch

diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c
index ef2e190688d40587c570db76af30d77eca56c408..b9397528f4c840d3aec462c1e12ef42eebee25b1 100644
--- a/gas/config/tc-arm.c
+++ b/gas/config/tc-arm.c
@@ -277,6 +277,8 @@  static const arm_feature_set arm_ext_predres =
   ARM_FEATURE_CORE_HIGH (ARM_EXT2_PREDRES);
 static const arm_feature_set arm_ext_bf16 =
   ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16);
+static const arm_feature_set arm_ext_i8mm =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM);
 
 static const arm_feature_set arm_arch_any = ARM_ANY;
 #ifdef OBJ_ELF
@@ -21481,6 +21483,79 @@  do_neon_dotproduct_u (void)
   return do_neon_dotproduct (1);
 }
 
+static void
+do_vusdot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+  else
+    {
+      inst.instruction |= (1 << 21);
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+      neon_three_args (rs == NS_QQQ);
+    }
+}
+
+static void
+do_vsudot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+}
+
+static void
+do_vsmmla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+
+}
+
+static void
+do_vummla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+
+}
+
 /* Crypto v1 instructions.  */
 static void
 do_crypto_2op_1 (unsigned elttype, int op)
@@ -25998,7 +26073,7 @@  static const struct asm_opcode insns[] =
 #define	THUMB_VARIANT &arm_ext_i8mm
  TUF ("vsmmla", c200c40, fc200c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
  TUF ("vummla", c200c50, fc200c50, 3, (RNQ, RNQ, RNQ), vummla, vummla),
- TUF ("vusmmla", ca00c40, fca00c40, 3, (RNQ, RNQ, RNQ), vummla, vummla),
+ TUF ("vusmmla", ca00c40, fca00c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
  TUF ("vusdot", c800d00, fc800d00, 3, (RNDQ, RNDQ, RNDQ_RNSC), vusdot, vusdot),
  TUF ("vsudot", c800d10, fc800d10, 3, (RNDQ, RNDQ, RNSC), vsudot, vsudot),
 };
@@ -31125,6 +31200,8 @@  static const struct arm_ext_table armv82a_ext_table[] =
   ARM_ADD ("simd", FPU_ARCH_NEON_VFP_ARMV8_1),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_2_FP16),
   ARM_ADD ("fp16fml", FPU_ARCH_NEON_VFP_ARMV8_2_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_1,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
   ARM_ADD ("dotprod", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
@@ -31141,6 +31218,8 @@  static const struct arm_ext_table armv84a_ext_table[] =
 {
   ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
 
@@ -31156,6 +31235,8 @@  static const struct arm_ext_table armv85a_ext_table[] =
 {
   ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
 
@@ -31167,6 +31248,7 @@  static const struct arm_ext_table armv85a_ext_table[] =
 
 static const struct arm_ext_table armv86a_ext_table[] =
 {
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
 };
 
@@ -31306,9 +31388,6 @@  struct arm_option_extension_value_table
    use the context sensitive approach using arm_ext_table's.  */
 static const struct arm_option_extension_value_table arm_extensions[] =
 {
-  ARM_EXT_OPT ("bf16",  ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16),
-			ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16),
-			ARM_ARCH_V8_2A),
   ARM_EXT_OPT ("crc",  ARCH_CRC_ARMV8, ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
 			 ARM_FEATURE_CORE_LOW (ARM_EXT_V8)),
   ARM_EXT_OPT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8,
diff --git a/gas/doc/c-arm.texi b/gas/doc/c-arm.texi
index 8afee70120f122e3724b27a71443d03b7c2ad719..a67bb59899abfe6638977234a1c43e196d8e395c 100644
--- a/gas/doc/c-arm.texi
+++ b/gas/doc/c-arm.texi
@@ -181,6 +181,7 @@  been added, again in ascending alphabetical order.  For example,
 
 The following extensions are currently supported:
 @code{bf16} (BFloat16 extensions for v8.6-A architecture),
+@code{i8mm} (Int8 Matrix Multiply extensions for v8.6-A architecture),
 @code{crc}
 @code{crypto} (Cryptography Extensions for v8-A architecture, implies @code{fp+simd}),
 @code{dotprod} (Dot Product Extensions for v8.2-A architecture, implies @code{fp+simd}),
diff --git a/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d b/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d
index 5dfdeb4d6ccc6575e357835e10dcb2638c03de35..ad99cda5dc4bc158b70c5a678e432c2ad64fd7d3 100644
--- a/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d
+++ b/gas/testsuite/gas/arm/bfloat16-cmdline-bad-3.d
@@ -1,4 +1,4 @@ 
 #name: Bfloat 16 bad extension
 #source: bfloat16-non-neon.s
 #as: -mno-warn-deprecated -march=armv8.1-a+bf16
-#error: .*Error: extension does not apply to the base architecture.*
+#error: .*Error: unknown architectural extension `bf16'*
diff --git a/gas/testsuite/gas/arm/i8mm.d b/gas/testsuite/gas/arm/i8mm.d
new file mode 100644
index 0000000000000000000000000000000000000000..6d7f1d74e443a047cc71ec3e2e2b515431eecceb
--- /dev/null
+++ b/gas/testsuite/gas/arm/i8mm.d
@@ -0,0 +1,36 @@ 
+#name: Int8 Matrix Multiply extension
+#source: i8mm.s
+#as: -mno-warn-deprecated -march=armv8.6-a+i8mm+simd -I$srcdir/$subdir
+#objdump: -dr --show-raw-insn
+
+.*: +file format .*arm.*
+
+Disassembly of section \.text:
+
+00000000 <\.text>:
+ *[0-9a-f]+:	fcea4c40 	vusmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c50 	vummla\.u8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c40 	vsmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fcea4d40 	vusdot\.s8	q10, q5, q0
+ *[0-9a-f]+:	feca4d50 	vsudot\.u8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d70 	vsudot\.u8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	feca4d40 	vusdot\.s8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d60 	vusdot\.s8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	fca5ad00 	vusdot\.s8	d10, d5, d0
+ *[0-9a-f]+:	fe85ad00 	vusdot\.s8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad20 	vusdot\.s8	d10, d5, d0\[1\]
+ *[0-9a-f]+:	fe85ad10 	vsudot\.u8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad30 	vsudot\.u8	d10, d5, d0\[1\]
+ *[0-9a-f]+:	fcea4c40 	vusmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c50 	vummla\.u8	q10, q5, q0
+ *[0-9a-f]+:	fc6a4c40 	vsmmla\.s8	q10, q5, q0
+ *[0-9a-f]+:	fcea4d40 	vusdot\.s8	q10, q5, q0
+ *[0-9a-f]+:	feca4d50 	vsudot\.u8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d70 	vsudot\.u8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	feca4d40 	vusdot\.s8	q10, q5, d0\[0\]
+ *[0-9a-f]+:	feca4d60 	vusdot\.s8	q10, q5, d0\[1\]
+ *[0-9a-f]+:	fca5ad00 	vusdot\.s8	d10, d5, d0
+ *[0-9a-f]+:	fe85ad00 	vusdot\.s8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad20 	vusdot\.s8	d10, d5, d0\[1\]
+ *[0-9a-f]+:	fe85ad10 	vsudot\.u8	d10, d5, d0\[0\]
+ *[0-9a-f]+:	fe85ad30 	vsudot\.u8	d10, d5, d0\[1\]
diff --git a/gas/testsuite/gas/arm/i8mm.s b/gas/testsuite/gas/arm/i8mm.s
new file mode 100644
index 0000000000000000000000000000000000000000..20d04309ce009a6c56f50f4080756a913a92177e
--- /dev/null
+++ b/gas/testsuite/gas/arm/i8mm.s
@@ -0,0 +1,32 @@ 
+vusmmla.s8 q10, q5, q0
+vummla.u8 q10, q5, q0
+vsmmla.s8 q10, q5, q0
+
+vusdot.s8 q10, q5, q0
+vsudot.u8 q10, q5, d0[0]
+vsudot.u8 q10, q5, d0[1]
+vusdot.s8 q10, q5, d0[0]
+vusdot.s8 q10, q5, d0[1]
+
+vusdot.s8 d10, d5, d0
+vusdot.s8 d10, d5, d0[0]
+vusdot.s8 d10, d5, d0[1]
+vsudot.u8 d10, d5, d0[0]
+vsudot.u8 d10, d5, d0[1]
+
+
+vusmmla q10.s8, q5.s8, q0.s8
+vummla q10.u8, q5.u8, q0.u8
+vsmmla q10.s8, q5.s8, q0.s8
+
+vusdot q10.s8, q5.s8, q0.s8
+vsudot q10.u8, q5.u8, d0.u8[0]
+vsudot q10.u8, q5.u8, d0.u8[1]
+vusdot q10.s8, q5.s8, d0.s8[0]
+vusdot q10.s8, q5.s8, d0.s8[1]
+
+vusdot d10.s8, d5.s8, d0.s8
+vusdot d10.s8, d5.s8, d0.s8[0]
+vusdot d10.s8, d5.s8, d0.s8[1]
+vsudot d10.u8, d5.u8, d0.u8[0]
+vsudot d10.u8, d5.u8, d0.u8[1]
diff --git a/include/opcode/arm.h b/include/opcode/arm.h
index 7aea4d6e56805731d8d91f9a908c1cca332f3ab9..982da5abbdefb7ca6f76287dae9281e2615d4312 100644
--- a/include/opcode/arm.h
+++ b/include/opcode/arm.h
@@ -75,6 +75,7 @@ 
 #define ARM_EXT2_V8_1M_MAIN  0x00008000 /* ARMv8.1-M Mainline.		     */
 #define ARM_EXT2_V8_6A	     0x00010000	/* ARM V8.6A.			     */
 #define ARM_EXT2_BF16	     0x00020000 /* ARMv8 bfloat16.		     */
+#define ARM_EXT2_I8MM	     0x00040000 /* ARMv8.6A i8mm.		     */
 
 /* Co-processor space extensions.  */
 #define ARM_CEXT_XSCALE	     0x00000001	/* Allow MIA etc.	 	   */
diff --git a/opcodes/arm-dis.c b/opcodes/arm-dis.c
index 50ae9576561477a7c6e50628ffb20e005d9e9e59..8f82cb24e46defbfe5963237a6b5a2cc94adb258 100644
--- a/opcodes/arm-dis.c
+++ b/opcodes/arm-dis.c
@@ -1471,6 +1471,20 @@  static const struct opcode32 neon_opcodes[] =
   {ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16),
     0xfe300810, 0xffb00f10, "vfma%6?tb.bf16\t%12-15,22Q, %16-19,7Q, %0-2D[%3,5d]"},
 
+  /* Matrix Multiply instructions.  */
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfc200c40, 0xffb00f50, "vsmmla.s8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfc200c50, 0xffb00f50, "vummla.u8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfca00c40, 0xffb00f50, "vusmmla.s8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfca00d00, 0xffb00f10, "vusdot.s8\t%12-15,22R, %16-19,7R, %0-3,5R"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfe800d00, 0xffb00f10, "vusdot.s8\t%12-15,22R, %16-19,7R, d%0-3d[%5d]"},
+  {ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM),
+    0xfe800d10, 0xffb00f10, "vsudot.u8\t%12-15,22R, %16-19,7R, d%0-3d[%5d]"},
+
   /* Two registers, miscellaneous.  */
   {ARM_FEATURE_COPROC (FPU_NEON_EXT_ARMV8),
     0xf3ba0400, 0xffbf0c10, "vrint%7-9?p?m?zaxn%u.f32\t%12-15,22R, %0-3,5R"},