[i386] Correct imul (r64) latency for modern Intel CPUs

Message ID 20171217092009.GA16559@x4
State New
Series
  • [i386] Correct imul (r64) latency for modern Intel CPUs

Commit Message

Markus Trippelsdorf Dec. 17, 2017, 9:20 a.m.
Since Nehalem the 64bit multiplication latency is three cycles, not
four. So update the costs to reflect reality.
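
For reference, the latency can be double-checked with a small dependent-chain
loop along these lines (only a sketch; it assumes x86-64 inline asm and
rdtsc-based timing, so the result is in reference cycles rather than exact
core cycles):

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

int main (void)
{
  uint64_t x = 1, m = 3;
  const uint64_t iters = 100000000;
  uint64_t start = __rdtsc ();
  for (uint64_t i = 0; i < iters; i++)
    /* Each imulq consumes the previous result, so the loop time is
       dominated by the multiply latency, not its throughput.  */
    __asm__ volatile ("imulq %1, %0" : "+r" (x) : "r" (m));
  uint64_t end = __rdtsc ();
  printf ("~%.2f cycles per dependent imulq (x = %llu)\n",
          (double) (end - start) / iters, (unsigned long long) x);
  return 0;
}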

Tested on X86_64.
OK for trunk?

Thanks.

	* x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply
	latencies.

	* gcc.target/i386/wmul-3.c: New test.

-- 
Markus

Comments

Jan Hubicka Dec. 17, 2017, 11:26 a.m. | #1
> Since Nehalem the 64bit multiplication latency is three cycles, not
> four. So update the costs to reflect reality.

I looked into the imul latencies and was a bit confused, so I decided to look
into it later and forgot.

Agner Fog's table http://www.agner.org/optimize/instruction_tables.pdf
lists for sandybridge-skylake a cost of 3 cycles for everything except
16bit (which is 4 cycles).

For earlier chips the numbers vary more.
 Merom (core duo) and Wolfdale are 3 for everything except 64bit, which is 5.
 Nehalem is 3 for everything, but 64bit has lower throughput.

We do not have a separate table for sandybridge, but I guess we do not care
that much about older cores (Vladimir is still running a tester on one,
so we do benchmark them).

So I guess the change is OK.  I do not see this as a very good reason to split
the sandybridge table out, but perhaps drop a comment in case we want
to do it in the future.

Honza
> 
> Tested on X86_64.
> OK for trunk?
> 
> Thanks.
> 
> 	* x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply
> 	latencies.
> 
> 	* gcc.target/i386/wmul-3.c: New test.
> 
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index 648219338308..ddb47ba44056 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1538,8 +1538,8 @@ struct processor_costs skylake_cost = {
>    {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
>     COSTS_N_INSNS (4),			/*				 HI */
>     COSTS_N_INSNS (3),			/*				 SI */
> -   COSTS_N_INSNS (4),			/*				 DI */
> -   COSTS_N_INSNS (4)},			/*			      other */
> +   COSTS_N_INSNS (3),			/*				 DI */
> +   COSTS_N_INSNS (3)},			/*			      other */
>    0,					/* cost of multiply per each bit set */
>    /* Expanding div/mod currently doesn't consider parallelism. So the cost
>       model is not realistic. We compensate by increasing the latencies a bit.  */
> @@ -2341,8 +2341,8 @@ struct processor_costs core_cost = {
>    {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
>     COSTS_N_INSNS (4),			/*				 HI */
>     COSTS_N_INSNS (3),			/*				 SI */
> -   COSTS_N_INSNS (4),			/*				 DI */
> -   COSTS_N_INSNS (4)},			/*			      other */
> +   COSTS_N_INSNS (3),			/*				 DI */
> +   COSTS_N_INSNS (3)},			/*			      other */
>    0,					/* cost of multiply per each bit set */
>    /* Expanding div/mod currently doesn't consider parallelism. So the cost
>       model is not realistic. We compensate by increasing the latencies a bit.  */
> diff --git a/gcc/testsuite/gcc.target/i386/wmul-3.c b/gcc/testsuite/gcc.target/i386/wmul-3.c
> new file mode 100644
> index 000000000000..66c077c2cc0d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/wmul-3.c
> @@ -0,0 +1,66 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -march=haswell" } */
> +
> +#include <stdint.h>
> +#include <string.h>
> +
> +static const char b100_tab[200] = {
> +    '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
> +    '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
> +    '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
> +    '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
> +    '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
> +    '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
> +    '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
> +    '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
> +    '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
> +    '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
> +    '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
> +    '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
> +    '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
> +    '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
> +    '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
> +    '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
> +    '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
> +    '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
> +    '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
> +    '9', '5', '9', '6', '9', '7', '9', '8', '9', '9',
> +};
> +
> +void uint64_to_ascii_ta7_32_base100(uint64_t val, char *dst) {
> +  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
> +  const uint64_t POW2_57_DIV_POW100_4 =
> +      ((int64_t)(1) << 57) / 100 / 100 / 100 / 100 + 1;
> +  const uint64_t MASK32 = ((int64_t)(1) << 32) - 1;
> +  int64_t hix = val / POW10_10;
> +  int64_t lox = val % POW10_10;
> +  int64_t lor = lox & (uint64_t)(-2);
> +  uint64_t hi = hix * POW2_57_DIV_POW100_4;
> +  uint64_t lo = lor * POW2_57_DIV_POW100_4;
> +  memcpy(dst + 0 * 10 + 0, &b100_tab[(hi >> 57) * 2], 2);
> +  memcpy(dst + 1 * 10 + 0, &b100_tab[(lo >> 57) * 2], 2);
> +  hi = (hi >> 25) + 1;
> +  lo = (lo >> 25) + 1;
> +  hi = (hi & MASK32) * 100;
> +  lo = (lo & MASK32) * 100;
> +  memcpy(dst + 0 * 10 + 2, &b100_tab[(hi >> 32) * 2], 2);
> +  hi = (hi & MASK32) * 100;
> +  memcpy(dst + 1 * 10 + 2, &b100_tab[(lo >> 32) * 2], 2);
> +  lo = (lo & MASK32) * 100;
> +  memcpy(dst + 0 * 10 + 4, &b100_tab[(hi >> 32) * 2], 2);
> +  hi = (hi & MASK32) * 100;
> +  memcpy(dst + 1 * 10 + 4, &b100_tab[(lo >> 32) * 2], 2);
> +  lo = (lo & MASK32) * 100;
> +  memcpy(dst + 0 * 10 + 6, &b100_tab[(hi >> 32) * 2], 2);
> +  hi = (hi & MASK32) * 100;
> +  memcpy(dst + 1 * 10 + 6, &b100_tab[(lo >> 32) * 2], 2);
> +  lo = (lo & MASK32) * 100;
> +  hi >>= 32;
> +  lo >>= 32;
> +  lo = (lo & (-2)) | (lox & 1);
> +  memcpy(dst + 0 * 10 + 8, &b100_tab[hi * 2], 2);
> +  memcpy(dst + 1 * 10 + 8, &b100_tab[lo * 2], 2);
> +  dst[2 * 10] = 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "imulq" 11 } } */
> -- 
> Markus

Markus Trippelsdorf Dec. 17, 2017, 12:11 p.m. | #2
On 2017.12.17 at 12:26 +0100, Jan Hubicka wrote:
> > Since Nehalem the 64bit multiplication latency is three cycles, not
> > four. So update the costs to reflect reality.
> 
> I looked into the imul latencies and was a bit confused, so I decided to look
> into it later and forgot.
> 
> Agner Fog's table http://www.agner.org/optimize/instruction_tables.pdf
> lists for sandybridge-skylake a cost of 3 cycles for everything except
> 16bit (which is 4 cycles).
> 
> For earlier chips the numbers vary more.
>  Merom (core duo) and Wolfdale are 3 for everything except 64bit, which is 5.
>  Nehalem is 3 for everything, but 64bit has lower throughput.
> 
> We do not have a separate table for sandybridge, but I guess we do not care
> that much about older cores (Vladimir is still running a tester on one,
> so we do benchmark them).
> 
> So I guess the change is OK.  I do not see this as a very good reason to split
> the sandybridge table out, but perhaps drop a comment in case we want
> to do it in the future.

Thanks. Here is what I have committed (r255760):

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 648219338308..477e478f1f77 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1538,8 +1538,8 @@ struct processor_costs skylake_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
   /* Expanding div/mod currently doesn't consider parallelism. So the cost
      model is not realistic. We compensate by increasing the latencies a bit.  */
@@ -2341,8 +2341,9 @@ struct processor_costs core_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   /* Here we tune for Sandybridge or newer.  */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
   /* Expanding div/mod currently doesn't consider parallelism. So the cost
      model is not realistic. We compensate by increasing the latencies a bit.  */
diff --git a/gcc/testsuite/gcc.target/i386/wmul-3.c b/gcc/testsuite/gcc.target/i386/wmul-3.c
new file mode 100644
index 000000000000..5f1619076386
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/wmul-3.c
@@ -0,0 +1,66 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=sandybridge" } */
+
+#include <stdint.h>
+#include <string.h>
+
+static const char b100_tab[200] = {
+    '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
+    '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
+    '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
+    '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
+    '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
+    '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
+    '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
+    '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
+    '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
+    '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
+    '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
+    '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
+    '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
+    '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
+    '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
+    '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
+    '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
+    '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
+    '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
+    '9', '5', '9', '6', '9', '7', '9', '8', '9', '9',
+};
+
+void uint64_to_ascii_ta7_32_base100(uint64_t val, char *dst) {
+  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
+  const uint64_t POW2_57_DIV_POW100_4 =
+      ((int64_t)(1) << 57) / 100 / 100 / 100 / 100 + 1;
+  const uint64_t MASK32 = ((int64_t)(1) << 32) - 1;
+  int64_t hix = val / POW10_10;
+  int64_t lox = val % POW10_10;
+  int64_t lor = lox & (uint64_t)(-2);
+  uint64_t hi = hix * POW2_57_DIV_POW100_4;
+  uint64_t lo = lor * POW2_57_DIV_POW100_4;
+  memcpy(dst + 0 * 10 + 0, &b100_tab[(hi >> 57) * 2], 2);
+  memcpy(dst + 1 * 10 + 0, &b100_tab[(lo >> 57) * 2], 2);
+  hi = (hi >> 25) + 1;
+  lo = (lo >> 25) + 1;
+  hi = (hi & MASK32) * 100;
+  lo = (lo & MASK32) * 100;
+  memcpy(dst + 0 * 10 + 2, &b100_tab[(hi >> 32) * 2], 2);
+  hi = (hi & MASK32) * 100;
+  memcpy(dst + 1 * 10 + 2, &b100_tab[(lo >> 32) * 2], 2);
+  lo = (lo & MASK32) * 100;
+  memcpy(dst + 0 * 10 + 4, &b100_tab[(hi >> 32) * 2], 2);
+  hi = (hi & MASK32) * 100;
+  memcpy(dst + 1 * 10 + 4, &b100_tab[(lo >> 32) * 2], 2);
+  lo = (lo & MASK32) * 100;
+  memcpy(dst + 0 * 10 + 6, &b100_tab[(hi >> 32) * 2], 2);
+  hi = (hi & MASK32) * 100;
+  memcpy(dst + 1 * 10 + 6, &b100_tab[(lo >> 32) * 2], 2);
+  lo = (lo & MASK32) * 100;
+  hi >>= 32;
+  lo >>= 32;
+  lo = (lo & (-2)) | (lox & 1);
+  memcpy(dst + 0 * 10 + 8, &b100_tab[hi * 2], 2);
+  memcpy(dst + 1 * 10 + 8, &b100_tab[lo * 2], 2);
+  dst[2 * 10] = 0;
+}
+
+/* { dg-final { scan-assembler-times "imulq" 11 } } */
-- 
Markus
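
As a minimal illustration (a sketch, not taken from the testsuite) of what the
DI entry influences: the 64bit multiply cost feeds the decision whether a
constant multiplication like the "* 100" steps above is kept as a plain imulq
or synthesized as a lea/shift/add sequence, which is what the
scan-assembler-times "imulq" 11 check in the new test guards.

#include <stdint.h>

/* Whether this becomes a single imulq or a lea/shift/add sequence is decided
   via the per-CPU multiply cost entries changed above; the exact output still
   depends on the GCC version and the -march setting.  */
uint64_t
scale_by_100 (uint64_t x)
{
  return x * 100;
}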

Patch

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 648219338308..ddb47ba44056 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1538,8 +1538,8 @@  struct processor_costs skylake_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
   /* Expanding div/mod currently doesn't consider parallelism. So the cost
      model is not realistic. We compensate by increasing the latencies a bit.  */
@@ -2341,8 +2341,8 @@  struct processor_costs core_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
    COSTS_N_INSNS (4),			/*				 HI */
    COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (4)},			/*			      other */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
   0,					/* cost of multiply per each bit set */
   /* Expanding div/mod currently doesn't consider parallelism. So the cost
      model is not realistic. We compensate by increasing the latencies a bit.  */
diff --git a/gcc/testsuite/gcc.target/i386/wmul-3.c b/gcc/testsuite/gcc.target/i386/wmul-3.c
new file mode 100644
index 000000000000..66c077c2cc0d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/wmul-3.c
@@ -0,0 +1,66 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=haswell" } */
+
+#include <stdint.h>
+#include <string.h>
+
+static const char b100_tab[200] = {
+    '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
+    '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
+    '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
+    '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
+    '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
+    '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
+    '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
+    '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
+    '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
+    '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
+    '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
+    '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
+    '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
+    '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
+    '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
+    '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
+    '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
+    '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
+    '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
+    '9', '5', '9', '6', '9', '7', '9', '8', '9', '9',
+};
+
+void uint64_to_ascii_ta7_32_base100(uint64_t val, char *dst) {
+  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
+  const uint64_t POW2_57_DIV_POW100_4 =
+      ((int64_t)(1) << 57) / 100 / 100 / 100 / 100 + 1;
+  const uint64_t MASK32 = ((int64_t)(1) << 32) - 1;
+  int64_t hix = val / POW10_10;
+  int64_t lox = val % POW10_10;
+  int64_t lor = lox & (uint64_t)(-2);
+  uint64_t hi = hix * POW2_57_DIV_POW100_4;
+  uint64_t lo = lor * POW2_57_DIV_POW100_4;
+  memcpy(dst + 0 * 10 + 0, &b100_tab[(hi >> 57) * 2], 2);
+  memcpy(dst + 1 * 10 + 0, &b100_tab[(lo >> 57) * 2], 2);
+  hi = (hi >> 25) + 1;
+  lo = (lo >> 25) + 1;
+  hi = (hi & MASK32) * 100;
+  lo = (lo & MASK32) * 100;
+  memcpy(dst + 0 * 10 + 2, &b100_tab[(hi >> 32) * 2], 2);
+  hi = (hi & MASK32) * 100;
+  memcpy(dst + 1 * 10 + 2, &b100_tab[(lo >> 32) * 2], 2);
+  lo = (lo & MASK32) * 100;
+  memcpy(dst + 0 * 10 + 4, &b100_tab[(hi >> 32) * 2], 2);
+  hi = (hi & MASK32) * 100;
+  memcpy(dst + 1 * 10 + 4, &b100_tab[(lo >> 32) * 2], 2);
+  lo = (lo & MASK32) * 100;
+  memcpy(dst + 0 * 10 + 6, &b100_tab[(hi >> 32) * 2], 2);
+  hi = (hi & MASK32) * 100;
+  memcpy(dst + 1 * 10 + 6, &b100_tab[(lo >> 32) * 2], 2);
+  lo = (lo & MASK32) * 100;
+  hi >>= 32;
+  lo >>= 32;
+  lo = (lo & (-2)) | (lox & 1);
+  memcpy(dst + 0 * 10 + 8, &b100_tab[hi * 2], 2);
+  memcpy(dst + 1 * 10 + 8, &b100_tab[lo * 2], 2);
+  dst[2 * 10] = 0;
+}
+
+/* { dg-final { scan-assembler-times "imulq" 11 } } */