[i386] Fix PR83358 - increase divide/mod latencies a bit

Message ID 20171212082848.GA245@x4
State New
Headers show
Series
  • [i386] Fix PR83358 - increase divide/mod latencies a bit
Related show

Commit Message

Markus Trippelsdorf Dec. 12, 2017, 8:28 a.m.
As the testcase shows, trunk currently generates horrible code for
divisions used in tight loops. This happens because the algorithm
expanding div/mod doesn't take parallelism into account and this makes
the cost model unrealistic.
Fix the issue by increasing the estimated latencies a bit.

Tested on X86_64.
OK for trunk?
(I will keep an eye on the periodic SPEC testers.)

	PR target/83358
	* x86-tune-costs.h (skylake_cost, core_cost): Increase div/mod
	latencies a bit.

-- 
Markus

Comments

Jeff Law Dec. 16, 2017, 4:29 a.m. | #1
On 12/12/2017 01:28 AM, Markus Trippelsdorf wrote:
> As the testcase shows, trunk currently generates horrible code for

> divisions used in tight loops. This happens because the algorithm

> expanding div/mod doesn't take parallelism into account and this makes

> the cost model unrealistic.

> Fix the issue by increasing the estimated latencies a bit.

> 

> Tested on X86_64.

> OK for trunk?

> (I will keep an eye on the periodic SPEC testers.)

> 

> 	PR target/83358

> 	* x86-tune-costs.h (skylake_cost, core_cost): Increase div/mod

> 	latencies a bit.

THanks.  I added a ChangeLog for the testsuite and installed this onto
the trunk.

We can iterate on other micro-architecture costs if necessary.

jeff

Patch

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 312467d9788f..648219338308 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1541,9 +1541,11 @@  struct processor_costs skylake_cost = {
    COSTS_N_INSNS (4),			/*				 DI */
    COSTS_N_INSNS (4)},			/*			      other */
   0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (8),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (8),			/*			    HI */
-   COSTS_N_INSNS (11),			/*			    SI */
+  /* Expanding div/mod currently doesn't consider parallelism. So the cost
+     model is not realistic. We compensate by increasing the latencies a bit.  */
+  {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (11),			/*			    HI */
+   COSTS_N_INSNS (14),			/*			    SI */
    COSTS_N_INSNS (76),			/*			    DI */
    COSTS_N_INSNS (76)},			/*			    other */
   COSTS_N_INSNS (1),			/* cost of movsx */
@@ -2342,11 +2344,11 @@  struct processor_costs core_cost = {
    COSTS_N_INSNS (4),			/*				 DI */
    COSTS_N_INSNS (4)},			/*			      other */
   0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (8),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (8),			/*			    HI */
-   /* 8-11 */
-   COSTS_N_INSNS (11),			/*			    SI */
-   /* 24-81 */
+  /* Expanding div/mod currently doesn't consider parallelism. So the cost
+     model is not realistic. We compensate by increasing the latencies a bit.  */
+  {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (11),			/*			    HI */
+   COSTS_N_INSNS (14),			/*			    SI */
    COSTS_N_INSNS (81),			/*			    DI */
    COSTS_N_INSNS (81)},			/*			    other */
   COSTS_N_INSNS (1),			/* cost of movsx */
diff --git a/gcc/testsuite/gcc.target/i386/pr83358-1.c b/gcc/testsuite/gcc.target/i386/pr83358-1.c
new file mode 100644
index 000000000000..96427b2f56dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr83358-1.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+#include <stdint.h>
+
+void bin2ascii(uint64_t val, char *dst) {
+  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
+  int64_t hix = val / POW10_10;
+  int64_t lox = val % POW10_10;
+  int32_t v0 = hix / 100000;
+  int32_t v1 = hix % 100000;
+  int32_t v2 = lox / 100000;
+  int32_t v3 = lox % 100000;
+  for (int i = 4; i != 0; --i) {
+    dst[i + 0 * 5] = v0 % 10 + '0';
+    v0 /= 10;
+    dst[i + 1 * 5] = v1 % 10 + '0';
+    v1 /= 10;
+    dst[i + 2 * 5] = v2 % 10 + '0';
+    v2 /= 10;
+    dst[i + 3 * 5] = v3 % 10 + '0';
+    v3 /= 10;
+  }
+  dst[0 * 5] = v0 + '0';
+  dst[1 * 5] = v1 + '0';
+  dst[2 * 5] = v2 + '0';
+  dst[3 * 5] = v3 + '0';
+  dst[4 * 5] = 0;
+}
+
+/* { dg-final { scan-assembler-not "idiv" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr83358-2.c b/gcc/testsuite/gcc.target/i386/pr83358-2.c
new file mode 100644
index 000000000000..f6039bf72feb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr83358-2.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=skylake-avx512" } */
+
+#include <stdint.h>
+
+void bin2ascii(uint64_t val, char *dst) {
+  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
+  int64_t hix = val / POW10_10;
+  int64_t lox = val % POW10_10;
+  int32_t v0 = hix / 100000;
+  int32_t v1 = hix % 100000;
+  int32_t v2 = lox / 100000;
+  int32_t v3 = lox % 100000;
+  for (int i = 4; i != 0; --i) {
+    dst[i + 0 * 5] = v0 % 10 + '0';
+    v0 /= 10;
+    dst[i + 1 * 5] = v1 % 10 + '0';
+    v1 /= 10;
+    dst[i + 2 * 5] = v2 % 10 + '0';
+    v2 /= 10;
+    dst[i + 3 * 5] = v3 % 10 + '0';
+    v3 /= 10;
+  }
+  dst[0 * 5] = v0 + '0';
+  dst[1 * 5] = v1 + '0';
+  dst[2 * 5] = v2 + '0';
+  dst[3 * 5] = v3 + '0';
+  dst[4 * 5] = 0;
+}
+
+/* { dg-final { scan-assembler-not "idiv" } } */