i386: Add pass_remove_partial_avx_dependency

Message ID 20180828180402.GA24142@intel.com
State New
Headers show
Series
  • i386: Add pass_remove_partial_avx_dependency
Related show

Commit Message

H.J. Lu Aug. 28, 2018, 6:04 p.m.
With -mavx, for

[hjl@gnu-cfl-1 skx-2]$ cat foo.i
extern float f;
extern double d;
extern int i;

void
foo (void)
{
  d = f;
  f = i;
}

we need to generate

	vxorp[ds]	%xmmN, %xmmN, %xmmN
	...
	vcvtss2sd	f(%rip), %xmmN, %xmmX
	...
	vcvtsi2ss	i(%rip), %xmmN, %xmmY

to avoid partial XMM register stall.  This patch adds a pass to generate
a single

	vxorps		%xmmN, %xmmN, %xmmN

at function entry, which is shared by all SF and DF conversions, instead
of generating one

	vxorp[ds]	%xmmN, %xmmN, %xmmN

for each SF/DF conversion.

Performance impacts on SPEC CPU 2017 rate with 1 copy using

-Ofast -march=native -mfpmath=sse -fno-associative-math -funroll-loops

are

1. On Broadwell server:

500.perlbench_r (-0.82%)
502.gcc_r (0.73%)
505.mcf_r (-0.24%)
520.omnetpp_r (-2.22%)
523.xalancbmk_r (-1.47%)
525.x264_r (0.31%)
531.deepsjeng_r (0.27%)
541.leela_r (0.85%)
548.exchange2_r (-0.11%)
557.xz_r (-0.34%)
Geomean: (-0.23%)

503.bwaves_r (0.00%)
507.cactuBSSN_r (-1.88%)
508.namd_r (0.00%)
510.parest_r (-0.56%)
511.povray_r (0.49%)
519.lbm_r (-1.28%)
521.wrf_r (-0.28%)
526.blender_r (0.55%)
527.cam4_r (-0.20%)
538.imagick_r (2.52%)
544.nab_r (-0.18%)
549.fotonik3d_r (-0.51%)
554.roms_r (-0.22%)
Geomean: (0.00%)

2. On Skylake client:

500.perlbench_r (-0.29%)
502.gcc_r (-0.36%)
505.mcf_r (1.77%)
520.omnetpp_r (-0.26%)
523.xalancbmk_r (-3.69%)
525.x264_r (-0.32%)
531.deepsjeng_r (0.00%)
541.leela_r (-0.46%)
548.exchange2_r (0.00%)
557.xz_r (0.00%)
Geomean: (-0.34%)

503.bwaves_r (0.00%)
507.cactuBSSN_r (-0.56%)
508.namd_r (0.87%)
510.parest_r (0.00%)
511.povray_r (-0.73%)
519.lbm_r (0.84%)
521.wrf_r (0.00%)
526.blender_r (-0.81%)
527.cam4_r (-0.43%)
538.imagick_r (2.55%)
544.nab_r (0.28%)
549.fotonik3d_r (0.00%)
554.roms_r (0.32%)
Geomean: (0.12%)

3. On Skylake server:

500.perlbench_r (-0.55%)
502.gcc_r (0.69%)
505.mcf_r (0.00%)
520.omnetpp_r (-0.33%)
523.xalancbmk_r (-0.21%)
525.x264_r (-0.27%)
531.deepsjeng_r (0.00%)
541.leela_r (0.00%)
548.exchange2_r (-0.11%)
557.xz_r (0.00%)
Geomean: (0.00%)

503.bwaves_r (0.58%)
507.cactuBSSN_r (0.00%)
508.namd_r (0.00%)
510.parest_r (0.18%)
511.povray_r (-0.58%)
519.lbm_r (0.25%)
521.wrf_r (0.40%)
526.blender_r (0.34%)
527.cam4_r (0.19%)
538.imagick_r (5.87%)
544.nab_r (0.17%)
549.fotonik3d_r (0.00%)
554.roms_r (0.00%)
Geomean: (0.62%)

On Skylake client, impacts on 538.imagick_r are

size before:

   text	   data	    bss	    dec	    hex	filename
2555577	  10876	   5576	2572029	 273efd	imagick_r.exe

size after:

   text	   data	    bss	    dec	    hex	filename
2511825	  10876	   5576	2528277	 269415	imagick_r.exe

number of vxorp[ds]:

before		after		difference
14570		4515		-69%

OK for trunk?

Thanks.


H.J.
---
gcc/

2018-08-28  H.J. Lu  <hongjiu.lu@intel.com>
	    Sunil K Pandey  <sunil.k.pandey@intel.com>

	PR target/87007
	* config/i386/i386-passes.def: Add
	pass_remove_partial_avx_dependency.
	* config/i386/i386-protos.h
	(make_pass_remove_partial_avx_dependency): New.
	* config/i386/i386.c (make_pass_remove_partial_avx_dependency):
	New function.
	(pass_data_remove_partial_avx_dependency): New.
	(pass_remove_partial_avx_dependency): Likewise.
	(make_pass_remove_partial_avx_dependency): Likewise.
	* config/i386/i386.md (SF/DF conversion splitters): Disabled
	for TARGET_AVX.

gcc/testsuite/

2018-08-28  H.J. Lu  <hongjiu.lu@intel.com>
	    Sunil K Pandey  <sunil.k.pandey@intel.com>

	PR target/87007
	* gcc.target/i386/pr87007.c: New file.
---
 gcc/config/i386/i386-passes.def         |   2 +
 gcc/config/i386/i386-protos.h           |   2 +
 gcc/config/i386/i386.c                  | 168 ++++++++++++++++++++++++
 gcc/config/i386/i386.md                 |   9 +-
 gcc/testsuite/gcc.target/i386/pr87007.c |  15 +++
 5 files changed, 193 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr87007.c

-- 
2.17.1

Patch

diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 4a6a95cc2d9..b439f3a9028 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -31,3 +31,5 @@  along with GCC; see the file COPYING3.  If not see
   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
 
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbranch);
+
+  INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index d1d59633dc0..1626c9d0d70 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -358,3 +358,5 @@  class rtl_opt_pass;
 extern rtl_opt_pass *make_pass_insert_vzeroupper (gcc::context *);
 extern rtl_opt_pass *make_pass_stv (gcc::context *);
 extern rtl_opt_pass *make_pass_insert_endbranch (gcc::context *);
+extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
+  (gcc::context *);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index c437c18a29c..b78c76e49c2 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2752,6 +2752,174 @@  make_pass_insert_endbranch (gcc::context *ctxt)
   return new pass_insert_endbranch (ctxt);
 }
 
+/* At function entry, generate a single
+	vxorps %xmmN, %xmmN, %xmmN
+   for all
+	vcvtss2sd  op, %xmmN, %xmmX
+	vcvtsd2ss  op, %xmmN, %xmmX
+	vcvtsi2ss  op, %xmmN, %xmmX
+	vcvtsi2sd  op, %xmmN, %xmmX
+ */
+
+static unsigned int
+remove_partial_avx_dependency (void)
+{
+  timevar_push (TV_MACH_DEP);
+
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_md_add_problem ();
+  df_analyze ();
+
+  basic_block bb;
+  rtx_insn *insn, *set_insn;
+  rtx set;
+  rtx v4sf_const0 = NULL_RTX;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+
+	  set = single_set (insn);
+	  if (set)
+	    {
+	      machine_mode dest_vecmode, dest_mode;
+	      rtx src = SET_SRC (set);
+	      rtx dest, vec, zero;
+
+	      /* Check for conversions to SF or DF.  */
+	      switch (GET_CODE (src))
+		{
+		case FLOAT_TRUNCATE:
+		  /* DF -> SF.  */
+		  if (GET_MODE (XEXP (src, 0)) != DFmode)
+		    continue;
+		  /* Fall through.  */
+		case FLOAT_EXTEND:
+		  /* SF -> DF.  */
+		case FLOAT:
+		  /* SI -> SF, SI -> DF, DI -> SF, DI -> DF.  */
+		  dest = SET_DEST (set);
+		  dest_mode = GET_MODE (dest);
+		  switch (dest_mode)
+		    {
+		    case E_SFmode:
+		      dest_vecmode = V4SFmode;
+		      break;
+		    case E_DFmode:
+		      dest_vecmode = V2DFmode;
+		      break;
+		    default:
+		      continue;
+		    }
+
+		  if (!TARGET_64BIT
+		      && GET_MODE (XEXP (src, 0)) == DImode)
+		    continue;
+
+		  if (!v4sf_const0)
+		    v4sf_const0 = gen_reg_rtx (V4SFmode);
+
+		  if (dest_vecmode == V4SFmode)
+		    zero = v4sf_const0;
+		  else
+		    zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
+
+		  /* Change source to vector mode.  */
+		  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
+		  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
+					   GEN_INT (HOST_WIDE_INT_1U));
+		  /* Change destination to vector mode.  */
+		  vec = gen_reg_rtx (dest_vecmode);
+		  /* Generate a XMM vector SET.  */
+		  set = gen_rtx_SET (vec, src);
+		  set_insn = emit_insn_before (set, insn);
+		  df_insn_rescan (set_insn);
+
+		  src = gen_rtx_SUBREG (dest_mode, vec, 0);
+		  set = gen_rtx_SET (dest, src);
+
+		  /* Drop possible dead definitions.  */
+		  PATTERN (insn) = set;
+
+		  INSN_CODE (insn) = -1;
+		  recog_memoized (insn);
+		  df_insn_rescan (insn);
+		  break;
+
+		default:
+		  break;
+		}
+	    }
+	}
+    }
+
+  if (v4sf_const0)
+    {
+      /* Generate a single vxorps at function entry and preform df
+	 rescan. */
+      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+      insn = BB_HEAD (bb);
+      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
+      set_insn = emit_insn_after (set, insn);
+      df_insn_rescan (set_insn);
+      df_process_deferred_rescans ();
+    }
+
+  timevar_pop (TV_MACH_DEP);
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_remove_partial_avx_dependency =
+{
+  RTL_PASS, /* type */
+  "rpad", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_remove_partial_avx_dependency : public rtl_opt_pass
+{
+public:
+  pass_remove_partial_avx_dependency (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return (TARGET_AVX
+	      && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+	      && TARGET_SSE_MATH
+	      && optimize
+	      && optimize_function_for_speed_p (cfun));
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      return remove_partial_avx_dependency ();
+    }
+}; // class pass_rpad
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
+{
+  return new pass_remove_partial_avx_dependency (ctxt);
+}
+
 /* Return true if a red-zone is in use.  We can't use red-zone when
    there are local indirect jumps, like "indirect_jump" or "tablejump",
    which jumps to another place in the function, since "call" in the
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 62dab1662a3..ff42d0f1574 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5471,7 +5471,8 @@ 
 (define_split
   [(set (match_operand:MODEF 0 "sse_reg_operand")
 	(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
-  "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+  "!TARGET_AVX
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
@@ -5497,7 +5498,8 @@ 
   [(set (match_operand:SF 0 "sse_reg_operand")
         (float_truncate:SF
 	  (match_operand:DF 1 "nonimmediate_operand")))]
-  "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+  "!TARGET_AVX
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && (!REG_P (operands[1])
        || REGNO (operands[0]) != REGNO (operands[1]))
@@ -5523,7 +5525,8 @@ 
   [(set (match_operand:DF 0 "sse_reg_operand")
         (float_extend:DF
           (match_operand:SF 1 "nonimmediate_operand")))]
-  "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+  "!TARGET_AVX
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && (!REG_P (operands[1])
        || REGNO (operands[0]) != REGNO (operands[1]))
diff --git a/gcc/testsuite/gcc.target/i386/pr87007.c b/gcc/testsuite/gcc.target/i386/pr87007.c
new file mode 100644
index 00000000000..e6f17ad6dc0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr87007.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern float f;
+extern double d;
+extern int i;
+
+void
+foo (void)
+{
+  d = f;
+  f = i;
+}
+
+/* { dg-final { scan-assembler-times "vxorp\[ds\]\[^\n\r\]*xmm\[0-9\]" 1 } } */