rs6000: Add execution tests for mma builtins.

Message ID 20200707174508.39156-1-acsawdey@linux.ibm.com
State New
Headers show
Series
  • rs6000: Add execution tests for mma builtins.
Related show

Commit Message

Aaron Sawdey via Gcc-patches July 7, 2020, 5:45 p.m.
Updated slightly, removed -Wno-psabi as requested and also fixed the
fact that it wasn't actually checking __builtin_cpu_is or
__builtin_cpu_supports. OK for trunk and backport to 10?

Thanks,
    Aaron

2020-06-30  Rajalakshmi Srinivasaraghavan  <rajis@linux.vnet.ibm.com>
	    Aaron Sawdey  <acsawdey@linux.ibm.com>

gcc/testsuite/
	* gcc.target/powerpc/mma-single-test.c: New file.
	* gcc.target/powerpc/mma-double-test.c: New file.
---
 .../gcc.target/powerpc/mma-double-test.c      | 204 +++++++++++++++++
 .../gcc.target/powerpc/mma-single-test.c      | 213 ++++++++++++++++++
 2 files changed, 417 insertions(+)
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-double-test.c
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-single-test.c

-- 
2.17.1

Comments

Segher Boessenkool July 8, 2020, 1:29 a.m. | #1
On Tue, Jul 07, 2020 at 12:45:08PM -0500, Aaron Sawdey via Gcc-patches wrote:
> Updated slightly, removed -Wno-psabi as requested and also fixed the

> fact that it wasn't actually checking __builtin_cpu_is or

> __builtin_cpu_supports. OK for trunk and backport to 10?


But you don't need that, and neither is it wanted even (we also want
the tests to run on systems with an older glibc, or no glibc at all).
Instead, you want to have a mma_hw selector (or abbreviation, with
a name with "powerpc" in it).

> +/* { dg-require-effective-target power10_hw } */


This already means that we are running on a system that can execute
ISA 3.1 insns (it tests if some specific "pli" works).

> +  if ( !__builtin_cpu_is ("power10"))

> +    {

> +      printf ("Error: __builtin_cpu_is says this is not power10\n");

> +      ret++;

> +    }


This means it will not run on later CPUs?  Not good.

> +  if ( !__builtin_cpu_supports ("arch_3_1"))

> +    {

> +      printf ("Error: __builtin_cpu_supports says arch_3_1 not supported.\n");

> +      ret++;

> +    }


This is always already tested for by that power10_hw selector.

> +  if ( !__builtin_cpu_supports ("mma"))

> +    {

> +      printf ("Error: __builtin_cpu_supports says mma not supported.\n");

> +      ret++;

> +    }


And for this, we probably want a mma_hw sooner rather than later.


Segher

Patch

diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
new file mode 100755
index 00000000000..9fdf6d9d2a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -0,0 +1,204 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <altivec.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef double v4sf_t __attribute__ ((vector_size (16))); /* Two doubles.  */
+/* Add result[3 - r] of accumulator ACC to the vector at CO[r * ldc + J] for
+   r = 0..3; RESULT, ROWC and CO come from the scope of the expansion.  */
+#define SAVE_ACC(ACC, ldc, J)  \
+  do { \
+    __builtin_mma_disassemble_acc (result, ACC); \
+    rowC = (v4sf_t *) &CO[0 * (ldc) + (J)]; rowC[0] += result[3]; \
+    rowC = (v4sf_t *) &CO[1 * (ldc) + (J)]; rowC[0] += result[2]; \
+    rowC = (v4sf_t *) &CO[2 * (ldc) + (J)]; rowC[0] += result[1]; \
+    rowC = (v4sf_t *) &CO[3 * (ldc) + (J)]; rowC[0] += result[0]; \
+  } while (0)
+
+/* Accumulate into C the product computed with the xvf64gerpp builtin from
+   A (16 doubles per K step) and B (4 doubles per K step), one 16 x 4 tile
+   at a time.  There is no remainder handling for M and N.  */
+void
+MMA (int m, int n, int k, double *A, double *B, double *C)
+{
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  v4sf_t *rowC;
+  for (int col = 0; col < n; col += 4)
+    {
+      double *AO = A;
+      double *CO = C;
+      C += m * 4;
+      for (int row = 0; row < m; row += 16)
+        {
+          double *BO = B;
+          __builtin_mma_xxsetaccz (&acc0);
+          __builtin_mma_xxsetaccz (&acc1);
+          __builtin_mma_xxsetaccz (&acc2);
+          __builtin_mma_xxsetaccz (&acc3);
+          __builtin_mma_xxsetaccz (&acc4);
+          __builtin_mma_xxsetaccz (&acc5);
+          __builtin_mma_xxsetaccz (&acc6);
+          __builtin_mma_xxsetaccz (&acc7);
+          for (unsigned long depth = 0; depth < k; depth++)
+            {
+              vec_t *a_vecs = (vec_t *) &AO[depth * 16];
+              vec_t *b_vecs = (vec_t *) &BO[depth * 4];
+              __vector_pair b_pair;
+              __builtin_mma_assemble_pair (&b_pair, b_vecs[1], b_vecs[0]);
+              __builtin_mma_xvf64gerpp (&acc0, b_pair, a_vecs[0]);
+              __builtin_mma_xvf64gerpp (&acc1, b_pair, a_vecs[1]);
+              __builtin_mma_xvf64gerpp (&acc2, b_pair, a_vecs[2]);
+              __builtin_mma_xvf64gerpp (&acc3, b_pair, a_vecs[3]);
+              __builtin_mma_xvf64gerpp (&acc4, b_pair, a_vecs[4]);
+              __builtin_mma_xvf64gerpp (&acc5, b_pair, a_vecs[5]);
+              __builtin_mma_xvf64gerpp (&acc6, b_pair, a_vecs[6]);
+              __builtin_mma_xvf64gerpp (&acc7, b_pair, a_vecs[7]);
+            }
+          /* accN accumulated the a_vecs[N] products; save it at offset 2 * N.  */
+          SAVE_ACC (&acc0, m, 0);
+          SAVE_ACC (&acc2, m, 4);
+          SAVE_ACC (&acc1, m, 2);
+          SAVE_ACC (&acc3, m, 6);
+          SAVE_ACC (&acc4, m, 8);
+          SAVE_ACC (&acc6, m, 12);
+          SAVE_ACC (&acc5, m, 10);
+          SAVE_ACC (&acc7, m, 14);
+          AO += k * 16;
+          BO += k * 4;
+          CO += 16;
+        }
+      B += k * 4;
+    }
+}
+
+/* Set matrix[j * row + i] to (i * 16 + 2 + j) / 0.123 for all i, j.  */
+void
+init (double *matrix, int row, int column)
+{
+  for (int c = 0; c < column; c++)
+    {
+      int base = c * row;
+      for (int r = 0; r < row; r++)
+        matrix[base + r] = (r * 16 + 2 + c) / 0.123;
+    }
+}
+
+/* Zero the ROW * COLUMN leading elements of both MATRIX and MATRIX1.  */
+void
+init0 (double *matrix, double *matrix1, int row, int column)
+{
+  for (int c = 0; c < column; c++)
+    for (int r = 0, at = c * row; r < row; r++, at++)
+      matrix[at] = matrix1[at] = 0;
+}
+
+
+/* Print a header naming the matrix, then its ROW lines of COLUMN values;
+   the value at line i, position j is matrix[j * row + i].  */
+void
+print (const char *name, const double *matrix, int row, int column)
+{
+  printf ("Matrix %s has %d rows and %d columns:\n", name, row, column);
+  for (int r = 0; r < row; r++)
+    {
+      for (int c = 0; c < column; c++)
+        printf ("%f ", matrix[c * row + r]);
+      printf ("\n");
+    }
+  printf ("\n");
+}
+
+/* Check MMA against a plain C product for rowsA = 16..128 and
+   colsB = 4..16 with an inner dimension of 1.  Returns the number of
+   errors found (0 on success).  */
+int
+main (int argc, char *argv[])
+{
+  int rowsA, colsB, common;
+  int i, j, k;
+  int ret = 0;
+
+  for (int t = 16; t <= 128; t += 16)
+    {
+      for (int t1 = 4; t1 <= 16; t1 += 4)
+        {
+          rowsA = t;
+          colsB = t1;
+          common = 1;
+          /* printf ("Running test for rows = %d,cols = %d\n", t, t1); */
+          double A[rowsA * common];
+          double B[common * colsB];
+          double C[rowsA * colsB];
+          double D[rowsA * colsB];
+
+          init (A, rowsA, common);
+          init (B, common, colsB);
+          init0 (C, D, rowsA, colsB);
+          MMA (rowsA, colsB, common, A, B, C);
+
+          /* Reference result, computed without the MMA builtins.  */
+          for (i = 0; i < colsB; i++)
+            {
+              for (j = 0; j < rowsA; j++)
+                {
+                  D[i * rowsA + j] = 0;
+                  for (k = 0; k < common; k++)
+                    D[i * rowsA + j] +=
+                      A[k * rowsA + j] * B[k + common * i];
+                }
+            }
+          /* The two results must match exactly.  */
+          for (i = 0; i < colsB; i++)
+            {
+              for (j = 0; j < rowsA; j++)
+                {
+                  for (k = 0; k < common; k++)
+                    if (D[i * rowsA + j] != C[i * rowsA + j])
+                      {
+                        printf ("Error %d,%d,%d\n",i,j,k);
+                        ret++;
+                      }
+                }
+            }
+          /*
+             print ("A", A, rowsA, common);
+             print ("B", B, common, colsB);
+             print ("C", C, rowsA, colsB);
+             print ("D", D, rowsA, colsB);
+           */
+        }
+    }
+
+  if (ret)
+    printf ("MMA double test fail: %d errors\n", ret);
+#ifdef VERBOSE
+  else
+    printf ("MMA double test success: 0 MMA errors\n");
+#endif
+
+#ifdef __BUILTIN_CPU_SUPPORTS__
+  if (!__builtin_cpu_is ("power10"))
+    {
+      printf ("Error: __builtin_cpu_is says this is not power10\n");
+      ret++;
+    }
+
+  if (!__builtin_cpu_supports ("arch_3_1"))
+    {
+      printf ("Error: __builtin_cpu_supports says arch_3_1 not supported.\n");
+      ret++;
+    }
+
+  if (!__builtin_cpu_supports ("mma"))
+    {
+      printf ("Error: __builtin_cpu_supports says mma not supported.\n");
+      ret++;
+    }
+#endif
+
+  return ret;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
new file mode 100755
index 00000000000..c1698ea2774
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
@@ -0,0 +1,213 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <altivec.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef float v4sf_t __attribute__ ((vector_size (16)));
+
+/* Add result[3 - r] of accumulator ACC to the vector at CO[r * ldc + J] for
+   r = 0..3; RESULT, ROWC and CO come from the scope of the expansion.  */
+#define SAVE_ACC(ACC, ldc, J)  \
+  do { \
+    __builtin_mma_disassemble_acc (result, ACC); \
+    rowC = (v4sf_t *) &CO[0 * (ldc) + (J)]; rowC[0] += result[3]; \
+    rowC = (v4sf_t *) &CO[1 * (ldc) + (J)]; rowC[0] += result[2]; \
+    rowC = (v4sf_t *) &CO[2 * (ldc) + (J)]; rowC[0] += result[1]; \
+    rowC = (v4sf_t *) &CO[3 * (ldc) + (J)]; rowC[0] += result[0]; \
+  } while (0)
+
+/* Likewise for r = 4..7, adding result[7 - r].  */
+#define SAVE_ACC1(ACC, ldc, J)  \
+  do { \
+    __builtin_mma_disassemble_acc (result, ACC); \
+    rowC = (v4sf_t *) &CO[4 * (ldc) + (J)]; rowC[0] += result[3]; \
+    rowC = (v4sf_t *) &CO[5 * (ldc) + (J)]; rowC[0] += result[2]; \
+    rowC = (v4sf_t *) &CO[6 * (ldc) + (J)]; rowC[0] += result[1]; \
+    rowC = (v4sf_t *) &CO[7 * (ldc) + (J)]; rowC[0] += result[0]; \
+  } while (0)
+/* Accumulate into C the product computed with the xvf32gerpp builtin from
+   A (16 floats per K step) and B (8 floats per K step), one 16 x 8 tile
+   at a time.  There is no remainder handling for M and N.  */
+void
+MMA (int m, int n, int k, float *A, float *B, float *C)
+{
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  v4sf_t *rowC;
+  for (int col = 0; col < n; col += 8)
+    {
+      float *AO = A;
+      float *CO = C;
+      C += m * 8;
+      for (int row = 0; row < m; row += 16)
+        {
+          float *BO = B;
+          __builtin_mma_xxsetaccz (&acc0);
+          __builtin_mma_xxsetaccz (&acc1);
+          __builtin_mma_xxsetaccz (&acc2);
+          __builtin_mma_xxsetaccz (&acc3);
+          __builtin_mma_xxsetaccz (&acc4);
+          __builtin_mma_xxsetaccz (&acc5);
+          __builtin_mma_xxsetaccz (&acc6);
+          __builtin_mma_xxsetaccz (&acc7);
+          for (unsigned long depth = 0; depth < k; depth++)
+            {
+              vec_t *a_vecs = (vec_t *) &AO[depth * 16];
+              vec_t *b_vecs = (vec_t *) &BO[depth * 8];
+              __builtin_mma_xvf32gerpp (&acc0, b_vecs[0], a_vecs[0]);
+              __builtin_mma_xvf32gerpp (&acc1, b_vecs[1], a_vecs[0]);
+              __builtin_mma_xvf32gerpp (&acc2, b_vecs[0], a_vecs[1]);
+              __builtin_mma_xvf32gerpp (&acc3, b_vecs[1], a_vecs[1]);
+              __builtin_mma_xvf32gerpp (&acc4, b_vecs[0], a_vecs[2]);
+              __builtin_mma_xvf32gerpp (&acc5, b_vecs[1], a_vecs[2]);
+              __builtin_mma_xvf32gerpp (&acc6, b_vecs[0], a_vecs[3]);
+              __builtin_mma_xvf32gerpp (&acc7, b_vecs[1], a_vecs[3]);
+            }
+          /* Even accumulators used b_vecs[0] (SAVE_ACC), odd ones b_vecs[1].  */
+          SAVE_ACC (&acc0, m, 0);
+          SAVE_ACC (&acc2, m, 4);
+          SAVE_ACC1 (&acc1, m, 0);
+          SAVE_ACC1 (&acc3, m, 4);
+          SAVE_ACC (&acc4, m, 8);
+          SAVE_ACC (&acc6, m, 12);
+          SAVE_ACC1 (&acc5, m, 8);
+          SAVE_ACC1 (&acc7, m, 12);
+          AO += k * 16;
+          BO += k * 8;
+          CO += 16;
+        }
+      B += k * 8;
+    }
+}
+
+/* Set matrix[j * row + i] to (i * 16 + 2 + j) / 0.123 for all i, j.  */
+void
+init (float *matrix, int row, int column)
+{
+  for (int c = 0; c < column; c++)
+    {
+      int base = c * row;
+      for (int r = 0; r < row; r++)
+        matrix[base + r] = (r * 16 + 2 + c) / 0.123;
+    }
+}
+
+/* Zero the ROW * COLUMN leading elements of both MATRIX and MATRIX1.  */
+void
+init0 (float *matrix, float *matrix1, int row, int column)
+{
+  for (int c = 0; c < column; c++)
+    for (int r = 0, at = c * row; r < row; r++, at++)
+      matrix[at] = matrix1[at] = 0;
+}
+
+
+/* Print a header naming the matrix, then its ROW lines of COLUMN values;
+   the value at line i, position j is matrix[j * row + i].  */
+void
+print (const char *name, const float *matrix, int row, int column)
+{
+  printf ("Matrix %s has %d rows and %d columns:\n", name, row, column);
+  for (int r = 0; r < row; r++)
+    {
+      for (int c = 0; c < column; c++)
+        printf ("%f ", matrix[c * row + r]);
+      printf ("\n");
+    }
+  printf ("\n");
+}
+
+/* Check MMA against a plain C product for rowsA = 16..128 and
+   colsB = 8..16 with an inner dimension of 1.  Returns the number of
+   errors found (0 on success).  */
+int
+main (int argc, char *argv[])
+{
+  int rowsA, colsB, common;
+  int i, j, k;
+  int ret = 0;
+
+  for (int t = 16; t <= 128; t += 16)
+    {
+      for (int t1 = 8; t1 <= 16; t1 += 8)
+        {
+          rowsA = t;
+          colsB = t1;
+          common = 1;
+          /* printf ("Running test for rows = %d,cols = %d\n", t, t1); */
+          float A[rowsA * common];
+          float B[common * colsB];
+          float C[rowsA * colsB];
+          float D[rowsA * colsB];
+
+          init (A, rowsA, common);
+          init (B, common, colsB);
+          init0 (C, D, rowsA, colsB);
+          MMA (rowsA, colsB, common, A, B, C);
+
+          /* Reference result, computed without the MMA builtins.  */
+          for (i = 0; i < colsB; i++)
+            {
+              for (j = 0; j < rowsA; j++)
+                {
+                  D[i * rowsA + j] = 0;
+                  for (k = 0; k < common; k++)
+                    D[i * rowsA + j] +=
+                      A[k * rowsA + j] * B[k + common * i];
+                }
+            }
+          /* The two results must match exactly.  */
+          for (i = 0; i < colsB; i++)
+            {
+              for (j = 0; j < rowsA; j++)
+                {
+                  for (k = 0; k < common; k++)
+                    if (D[i * rowsA + j] != C[i * rowsA + j])
+                      {
+                        printf ("Error %d,%d,%d\n",i,j,k);
+                        ret++;
+                      }
+                }
+            }
+          if (ret)
+            {
+              print ("A", A, rowsA, common);
+              print ("B", B, common, colsB);
+              print ("C", C, rowsA, colsB);
+              print ("D", D, rowsA, colsB);
+            }
+        }
+    }
+
+  if (ret)
+    printf ("MMA single test fail: %d errors\n", ret);
+#ifdef VERBOSE
+  else
+    printf ("MMA single test success: 0 MMA errors\n");
+#endif
+
+#ifdef __BUILTIN_CPU_SUPPORTS__
+  if (!__builtin_cpu_is ("power10"))
+    {
+      printf ("Error: __builtin_cpu_is says this is not power10\n");
+      ret++;
+    }
+
+  if (!__builtin_cpu_supports ("arch_3_1"))
+    {
+      printf ("Error: __builtin_cpu_supports says arch_3_1 not supported.\n");
+      ret++;
+    }
+
+  if (!__builtin_cpu_supports ("mma"))
+    {
+      printf ("Error: __builtin_cpu_supports says mma not supported.\n");
+      ret++;
+    }
+#endif
+
+  return ret;
+}