[OG9,committed] Backport GCN expcnt patches

Message ID 8dfcafc5-a3b5-e7a9-3fc6-be8ee471e84b@codesourcery.com
State New
Headers show
Series
  • [OG9,committed] Backport GCN expcnt patches
Related show

Commit Message

Andrew Stubbs Sept. 5, 2019, 4:07 p.m.
I just committed the attached patch to the openacc-gcc-9-branch.

The patch removes the redundant s_waitcnt instruction from store 
instructions. The s_waitcnt with expcnt was a misunderstanding of the 
documentation.

Andrew

Comments

Bernhard Reutner-Fischer Sept. 6, 2019, 7:42 a.m. | #1
On 5 September 2019 18:07:25 CEST, Andrew Stubbs <ams@codesourcery.com> wrote:
>I just committed the attached patch to the openacc-gcc-9-branch.


+	  /* Store that requires input registers are not overwritten by
+	     following instruction.  */
+	  if ((prev_insn->age + nops_rqd) < 1
+	      && prev_insn->delayeduse == DELAYEDUSE_YES
+	      && ((hard_reg_set_intersect_p
+		   (prev_insn->reads, iwrites))))
+	    nops_rqd = 1 - prev_insn->age;

Excess braces everywhere, fwiw.
Andrew Stubbs Sept. 6, 2019, 8:44 a.m. | #2
On 06/09/2019 08:42, Bernhard Reutner-Fischer wrote:
> On 5 September 2019 18:07:25 CEST, Andrew Stubbs <ams@codesourcery.com> wrote:

>> I just committed the attached patch to the openacc-gcc-9-branch.

> 

> +	  /* Store that requires input registers are not overwritten by

> +	     following instruction.  */

> +	  if ((prev_insn->age + nops_rqd) < 1

> +	      && prev_insn->delayeduse == DELAYEDUSE_YES

> +	      && ((hard_reg_set_intersect_p

> +		   (prev_insn->reads, iwrites))))

> +	    nops_rqd = 1 - prev_insn->age;

> 

> Excess braces everywhere, fwiw.


So it does.

Oh well, it's done now so it can stay.

Thanks anyway

Andrew

Patch

Backport expcnt patches.

2019-09-05  Andrew Stubbs  <ams@codesourcery.com>

	Backport from mainline:

	2019-07-31  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/gcn/gcn-valu.md
	(scatter<mode>_insn_1offset<exec_scatter>): Remove s_waitcnt.
	(scatter<mode>_insn_1offset_ds<exec_scatter>): Likewise.
	(scatter<mode>_insn_2offsets<exec_scatter>): Likewise.
	* config/gcn/gcn.c (gcn_md_reorg): Add delayeduse and reads to
	struct ilist. Add nops for delayeduse insns.
	* config/gcn/gcn.md (delayeduse): New attribute.
	(*movbi): Remove s_waitcnt from stores.
	(*mov<mode>_insn): Likewise.
	(*movti_insn): Likewise. Add delayeduse attribute.
	(sync_compare_and_swap<mode>_insn): Add delayeduse attribute.
	(atomic_store<mode>): Remove or adjust s_waitcnt.

	2019-09-05  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/gcn/gcn.md (*movti_insn): Set delayeduse for global_store.
	(sync_compare_and_swap<mode>_insn): Likewise.

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 3cc59dd1cd6..3a2287d0ebf 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -863,15 +863,12 @@ 
     if (AS_FLAT_P (as))
       {
 	if (TARGET_GCN5_PLUS)
-	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;"
-		   "s_waitcnt\texpcnt(0)", glc);
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s", glc);
 	else
-	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)",
-		   glc);
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s", glc);
       }
     else if (AS_GLOBAL_P (as))
-      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
-	       "s_waitcnt\texpcnt(0)", glc);
+      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s", glc);
     else
       gcc_unreachable ();
 
@@ -895,7 +892,7 @@ 
   {
     addr_space_t as = INTVAL (operands[3]);
     static char buf[200];
-    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)",
+    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s",
 	     (AS_GDS_P (as) ? " gds" : ""));
     return buf;
   }
@@ -929,8 +926,8 @@ 
 	/* Work around assembler bug in which a 64-bit register is expected,
 	but a 32-bit value would be correct.  */
 	int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
-	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
-		      "s_waitcnt\texpcnt(0)", reg, reg + 1, glc);
+	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s",
+		 reg, reg + 1, glc);
       }
     else
       gcc_unreachable ();
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index b7cf6f093fa..66854b6f9c5 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -4509,7 +4509,9 @@  gcn_md_reorg (void)
   {
     rtx_insn *insn;
     attr_unit unit;
+    attr_delayeduse delayeduse;
     HARD_REG_SET writes;
+    HARD_REG_SET reads;
     int age;
   } back[max_waits];
   int oldest = 0;
@@ -4528,6 +4530,7 @@  gcn_md_reorg (void)
 
       attr_type itype = get_attr_type (insn);
       attr_unit iunit = get_attr_unit (insn);
+      attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
       HARD_REG_SET ireads, iwrites;
       CLEAR_HARD_REG_SET (ireads);
       CLEAR_HARD_REG_SET (iwrites);
@@ -4603,6 +4606,14 @@  gcn_md_reorg (void)
 		  (regs, reg_class_contents[(int) VGPR_REGS]))
 		nops_rqd = 2 - prev_insn->age;
 	    }
+
+	  /* Store that requires input registers are not overwritten by
+	     following instruction.  */
+	  if ((prev_insn->age + nops_rqd) < 1
+	      && prev_insn->delayeduse == DELAYEDUSE_YES
+	      && ((hard_reg_set_intersect_p
+		   (prev_insn->reads, iwrites))))
+	    nops_rqd = 1 - prev_insn->age;
 	}
 
       /* Insert the required number of NOPs.  */
@@ -4630,7 +4641,9 @@  gcn_md_reorg (void)
       /* Track the current instruction as a previous instruction.  */
       back[oldest].insn = insn;
       back[oldest].unit = iunit;
+      back[oldest].delayeduse = idelayeduse;
       COPY_HARD_REG_SET (back[oldest].writes, iwrites);
+      COPY_HARD_REG_SET (back[oldest].reads, ireads);
       back[oldest].age = 0;
       oldest = (oldest + 1) % max_waits;
 
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 2b805a73c56..537bb260ff7 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -285,6 +285,11 @@ 
 
 (define_attr "laneselect" "yes,no" (const_string "no"))
 
+; Identify instructions that require a "Manually Inserted Wait State" if
+; their inputs are overwritten by subsequent instructions.
+
+(define_attr "delayeduse" "yes,no" (const_string "no"))
+
 ;; }}}
 ;; {{{ Iterators useful across the wole machine description
 
@@ -475,15 +480,15 @@ 
     case 6:
       return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
     case 7:
-      return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)";
+      return "s_store_dword\t%1, %A0";
     case 8:
       return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
     case 9:
-      return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+      return "flat_store_dword\t%A0, %1%O0%g0";
     case 10:
       return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
     case 11:
-      return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+      return "global_store_dword\t%A0, %1%O0%g0";
     default:
       gcc_unreachable ();
     }
@@ -506,20 +511,20 @@ 
   s_movk_i32\t%0, %1
   s_mov_b32\t%0, %1
   s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
-  s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0)
+  s_buffer_store%s1\t%1, s[0:3], %0
   s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dword\t%1, %A0
   v_mov_b32\t%0, %1
   v_readlane_b32\t%0, %1, 0
   v_writelane_b32\t%0, %1, 0
   flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store_dword\t%A0, %1%O0%g0
   v_mov_b32\t%0, %1
-  ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write_b32\t%A0, %1%O0
   ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   s_mov_b32\t%0, %1
   global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store_dword\t%A0, %1%O0%g0"
   [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
 		     flat,vop1,ds,ds,sop1,flat,flat")
    (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
@@ -541,12 +546,12 @@ 
   v_readlane_b32\t%0, %1, 0
   v_writelane_b32\t%0, %1, 0
   flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store%s0\t%A0, %1%O0%g0
   v_mov_b32\t%0, %1
-  ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write%b0\t%A0, %1%O0
   ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store%s0\t%A0, %1%O0%g0"
   [(set_attr "type"
 	     "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
    (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
@@ -564,18 +569,18 @@ 
   s_mov_b64\t%0, %1
   s_mov_b64\t%0, %1
   #
-  s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dwordx2\t%1, %A0
   s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
   #
   #
   #
   #
   flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
-  ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  flat_store_dwordx2\t%A0, %1%O0%g0
+  ds_write_b64\t%A0, %1%O0
   ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store_dwordx2\t%A0, %1%O0%g0"
   "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
     && !gcn_sgpr_move_p (operands[0], operands[1]))
    || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
@@ -617,16 +622,16 @@ 
   ""
   "@
   #
-  s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dwordx4\t%1, %A0
   s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store_dwordx4\t%A0, %1%O0%g0
   flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
   #
   #
   #
-  global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  global_store_dwordx4\t%A0, %1%O0%g0
   global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write_b128\t%A0, %1%O0
   ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
   "reload_completed
    && REG_P (operands[0])
@@ -647,6 +652,7 @@ 
   }
   [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
 		     ds,ds")
+   (set_attr "delayeduse" "*,*,yes,*,*,*,*,*,yes,*,*,*")
    (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
 
 ;; }}}
@@ -1594,7 +1600,8 @@ 
    global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
   [(set_attr "type" "smem,flat,flat")
    (set_attr "length" "12")
-   (set_attr "gcn_version" "gcn5,*,gcn5")])
+   (set_attr "gcn_version" "gcn5,*,gcn5")
+   (set_attr "delayeduse" "*,yes,yes")])
 
 (define_insn "sync_compare_and_swap<mode>_lds_insn"
   [(set (match_operand:SIDI 0 "register_operand"    "= v")
@@ -1697,14 +1704,11 @@ 
 	switch (which_alternative)
 	  {
 	  case 0:
-	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
-		   "s_waitcnt\texpcnt(0)";
+	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
 	  case 1:
-	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)";
+	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc";
 	  case 2:
-	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
-	           "s_waitcnt\texpcnt(0)";
+	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc";
 	  }
 	break;
       case MEMMODEL_ACQ_REL:
@@ -1714,13 +1718,13 @@ 
 	  {
 	  case 0:
 	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol";
+		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
 	  case 2:
 	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
 	  }
 	break;
       }