[04/10] GCN machine description

Message ID ee7b3ebace72700f5190b3ec9902107713f820e6.1542381960.git.ams@codesourcery.com
State New
Headers show
Series
  • AMD GCN Port v2
Related show

Commit Message

Andrew Stubbs Nov. 16, 2018, 4:27 p.m.
This patch contains the machine description portion of the GCN back-end.  I've
broken it out mainly to avoid the mailing list size limit.

2018-11-16  Andrew Stubbs  <ams@codesourcery.com>
	    Kwok Cheung Yeung  <kcy@codesourcery.com>
	    Julian Brown  <julian@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>
	    Jan Hubicka  <hubicka@ucw.cz>
	    Martin Jambor  <mjambor@suse.cz>

	gcc/
	* config/gcn/constraints.md: New file.
	* config/gcn/gcn-valu.md: New file.
	* config/gcn/gcn.md: New file.
	* config/gcn/predicates.md: New file.
---
 gcc/config/gcn/constraints.md |  139 ++
 gcc/config/gcn/gcn-valu.md    | 3437 +++++++++++++++++++++++++++++++++++++++++
 gcc/config/gcn/gcn.md         | 2152 ++++++++++++++++++++++++++
 gcc/config/gcn/predicates.md  |  193 +++
 4 files changed, 5921 insertions(+)
 create mode 100644 gcc/config/gcn/constraints.md
 create mode 100644 gcc/config/gcn/gcn-valu.md
 create mode 100644 gcc/config/gcn/gcn.md
 create mode 100644 gcc/config/gcn/predicates.md

Patch

diff --git a/gcc/config/gcn/constraints.md b/gcc/config/gcn/constraints.md
new file mode 100644
index 0000000..864dbd5
--- /dev/null
+++ b/gcc/config/gcn/constraints.md
@@ -0,0 +1,139 @@ 
+;; Constraint definitions for GCN.
+;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Matches a CONST_INT whose value lies in the closed range [-16, 64].
+(define_constraint "I"
+  "Inline integer constant"
+  (and (match_code "const_int")
+       (match_test "ival >= -16 && ival <= 64")))
+
+;; Matches a CONST_INT in [-0x8000, 0x7fff].  Biasing by 0x8000 in unsigned
+;; arithmetic lets a single comparison check both bounds.
+(define_constraint "J"
+  "Signed integer 16-bit inline constant"
+  (and (match_code "const_int")
+       (match_test "((unsigned HOST_WIDE_INT) ival + 0x8000) < 0x10000")))
+
+;; Matches only the CONST_INT -1 (all bits set).
+(define_constraint "Kf"
+  "Immediate constant -1"
+  (and (match_code "const_int")
+       (match_test "ival == -1")))
+
+;; Matches a CONST_INT in [0, 0x7fff].  The unsigned cast makes negative
+;; values compare as very large, so they fail the test.
+(define_constraint "L"
+  "Unsigned integer 15-bit constant"
+  (and (match_code "const_int")
+       (match_test "((unsigned HOST_WIDE_INT) ival) < 0x8000")))
+
+;; The constraints below accept integer, floating-point or vector constants.
+;; A, B, C and DA each defer the actual range check to a gcn_*_p helper that
+;; is not defined in this file.
+
+;; Constant accepted by gcn_inline_constant_p.
+(define_constraint "A"
+  "Inline immediate parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_inline_constant_p (op)")))
+
+;; Constant accepted by gcn_constant_p.
+(define_constraint "B"
+  "Immediate 32-bit parameter"
+  (and (match_code "const_int,const_double,const_vector")
+	(match_test "gcn_constant_p (op)")))
+
+;; Constant accepted by gcn_constant64_p.
+(define_constraint "C"
+  "Immediate 32-bit parameter zero-extended to 64-bits"
+  (and (match_code "const_int,const_double,const_vector")
+	(match_test "gcn_constant64_p (op)")))
+
+;; Constant accepted by gcn_inline_constant64_p.
+(define_constraint "DA"
+  "Splittable inline immediate 64-bit parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_inline_constant64_p (op)")))
+
+;; Unlike DA, this applies no further test: any const_int, const_double or
+;; const_vector matches.
+(define_constraint "DB"
+  "Splittable immediate 64-bit parameter"
+  (match_code "const_int,const_double,const_vector"))
+
+;; Matches any UNSPEC rtx, whatever its unspec number.
+(define_constraint "U"
+  "unspecified value"
+  (match_code "unspec"))
+
+;; Matches a SYMBOL_REF or LABEL_REF.
+(define_constraint "Y"
+  "Symbol or label for relative calls"
+  (match_code "symbol_ref,label_ref"))
+
+;; Register-class constraints.  Each one maps a constraint string onto the
+;; named register class; the classes themselves are not defined in this file.
+
+(define_register_constraint "v" "VGPR_REGS"
+  "VGPR registers")
+
+(define_register_constraint "Sg" "SGPR_REGS"
+  "SGPR registers")
+
+(define_register_constraint "SD" "SGPR_DST_REGS"
+  "registers useable as a destination of scalar operation")
+
+(define_register_constraint "SS" "SGPR_SRC_REGS"
+  "registers useable as a source of scalar operation")
+
+(define_register_constraint "Sm" "SGPR_MEM_SRC_REGS"
+  "registers useable as a source of scalar memory operation")
+
+(define_register_constraint "Sv" "SGPR_VOP3A_SRC_REGS"
+  "registers useable as a source of VOP3A instruction")
+
+;; Condition-register constraints.
+
+(define_register_constraint "ca" "ALL_CONDITIONAL_REGS"
+  "SCC VCCZ or EXECZ")
+
+(define_register_constraint "cs" "SCC_CONDITIONAL_REG"
+  "SCC")
+
+(define_register_constraint "cV" "VCC_CONDITIONAL_REG"
+  "VCC")
+
+;; The execution-mask register.
+
+(define_register_constraint "e" "EXEC_MASK_REG"
+  "EXEC")
+
+;; Special memory constraints.  Every one requires a MEM and tests the MEM's
+;; address space with an AS_*_P macro; RF, RS and RM additionally validate
+;; the address (or the whole MEM) with a gcn_* helper defined elsewhere.
+
+;; MEM in the scratch address space.
+(define_special_memory_constraint "RB"
+  "Buffer memory address to scratch memory."
+  (and (match_code "mem")
+       (match_test "AS_SCRATCH_P (MEM_ADDR_SPACE (op))")))
+
+;; MEM in the flat address space whose address passes gcn_flat_address_p.
+(define_special_memory_constraint "RF"
+  "Buffer memory address to flat memory."
+  (and (match_code "mem")
+       (match_test "AS_FLAT_P (MEM_ADDR_SPACE (op))
+		    && gcn_flat_address_p (XEXP (op, 0), mode)")))
+
+;; MEM in the scalar-flat address space that passes gcn_scalar_flat_mem_p.
+(define_special_memory_constraint "RS"
+  "Buffer memory address to scalar flat memory."
+  (and (match_code "mem")
+       (match_test "AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op))
+		    && gcn_scalar_flat_mem_p (op)")))
+
+;; MEM in the LDS address space.
+(define_special_memory_constraint "RL"
+  "Buffer memory address to LDS memory."
+  (and (match_code "mem")
+       (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))")))
+
+;; MEM in the GDS address space.
+(define_special_memory_constraint "RG"
+  "Buffer memory address to GDS memory."
+  (and (match_code "mem")
+       (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")))
+
+;; MEM in either the GDS or the LDS address space (union of RG and RL).
+(define_special_memory_constraint "RD"
+  "Buffer memory address to GDS or LDS memory."
+  (and (match_code "mem")
+       (ior (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")
+	    (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))"))))
+
+;; MEM in the global address space whose address passes gcn_global_address_p.
+(define_special_memory_constraint "RM"
+  "Memory address to global (main) memory."
+  (and (match_code "mem")
+       (match_test "AS_GLOBAL_P (MEM_ADDR_SPACE (op))
+		    && gcn_global_address_p (XEXP (op, 0))")))
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
new file mode 100644
index 0000000..3d85a64
--- /dev/null
+++ b/gcc/config/gcn/gcn-valu.md
@@ -0,0 +1,3437 @@ 
+;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; {{{ Vector iterators
+
+; Vector modes for one vector register (64 lanes of 8-, 16- or 32-bit
+; elements).  The *_ALT iterators list exactly the same modes as their
+; *_MODE counterparts; presumably they exist so that one pattern can iterate
+; over two modes independently -- confirm against their users.
+(define_mode_iterator VEC_1REG_MODE
+		      [V64QI V64HI V64SI V64HF V64SF])
+(define_mode_iterator VEC_1REG_ALT
+		      [V64QI V64HI V64SI V64HF V64SF])
+
+; Integer-only subsets of the above.
+(define_mode_iterator VEC_1REG_INT_MODE
+		      [V64QI V64HI V64SI])
+(define_mode_iterator VEC_1REG_INT_ALT
+		      [V64QI V64HI V64SI])
+
+; Scalar integer modes matching the elements of VEC_1REG_INT_MODE.
+(define_mode_iterator SCALAR_1REG_INT_MODE
+		      [QI HI SI])
+
+; Vector modes for two vector registers
+(define_mode_iterator VEC_2REG_MODE
+		      [V64DI V64DF])
+
+; All of the vector modes above
+(define_mode_iterator VEC_REG_MODE
+		      [V64QI V64HI V64SI V64HF V64SF    ; Single reg
+		       V64DI V64DF])		        ; Double reg
+
+; Element mode of each vector mode, in lower case (for use in pattern names).
+(define_mode_attr scalar_mode
+  [(V64QI "qi") (V64HI "hi") (V64SI "si")
+   (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
+
+; Element mode of each vector mode, in upper case (for use as an RTL mode).
+(define_mode_attr SCALAR_MODE
+  [(V64QI "QI") (V64HI "HI") (V64SI "SI")
+   (V64HF "HF") (V64SF "SF") (V64DI "DI") (V64DF "DF")])
+
+;; }}}
+;; {{{ Vector moves
+
+; This is the entry point for all vector register moves.  Memory accesses can
+; come this way also, but will more usually use the reload_in/out,
+; gather/scatter, maskload/store, etc.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+	(match_operand:VEC_REG_MODE 1 "general_operand"))]
+  ""
+  {
+    /* Case 1: store to memory before LRA.  Force the source into a register
+       and emit a scatter, using the mask from gcn_full_exec_reg ().  */
+    if (MEM_P (operands[0]) && !lra_in_progress && !reload_completed)
+      {
+	rtx exec = gcn_full_exec_reg ();
+	operands[1] = force_reg (<MODE>mode, operands[1]);
+	rtx scratch = gen_rtx_SCRATCH (V64DImode);
+	/* The address space and volatility of the MEM are passed to the
+	   scatter pattern as integer constants.  */
+	rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+	rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+	rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, exec,
+							operands[0],
+							scratch);
+	emit_insn (gen_scatter<mode>_expr (expr, operands[1], a, v, exec));
+	DONE;
+      }
+    /* Case 2: load from memory before LRA.  Emit a gather whose inactive
+       lanes take an undefined value.  */
+    else if (MEM_P (operands[1]) && !lra_in_progress && !reload_completed)
+      {
+	rtx exec = gcn_full_exec_reg ();
+	rtx undef = gcn_gen_undef (<MODE>mode);
+	rtx scratch = gen_rtx_SCRATCH (V64DImode);
+	rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+	rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+	rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, exec,
+							operands[1],
+							scratch);
+	emit_insn (gen_gather<mode>_expr (operands[0], expr, a, v, undef,
+					  exec));
+	DONE;
+      }
+    /* Case 3: a memory move that the two cases above did not take, i.e. one
+       requested while LRA is in progress.  Use the SGPR-base pattern with a
+       fresh V64DI register as its scratch operand.  */
+    else if ((MEM_P (operands[0]) || MEM_P (operands[1]))
+	     && !reload_completed)
+      {
+	rtx scratch = gen_reg_rtx (V64DImode);
+	emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], scratch));
+	DONE;
+      }
+    /* Case 4: a non-memory move before LRA, unless the source is an
+       UNSPEC_VECTOR unspec.  Emit the vec_merge form, mov<mode>_vector.  */
+    else if (!lra_in_progress && !reload_completed
+	     && !(GET_CODE (operands[1]) == UNSPEC
+		  && XINT (operands[1], 1) == UNSPEC_VECTOR))
+      {
+	rtx exec = gcn_full_exec_reg ();
+	rtx undef = gcn_gen_undef (<MODE>mode);
+	emit_insn (gen_mov<mode>_vector (operands[0], operands[1], exec,
+					 undef));
+	DONE;
+      }
+    /* Otherwise fall through: the plain SET in the pattern above is
+       emitted unchanged.  */
+  })
+
+; A pseudo instruction that helps LRA use the "U0" constraint.
+
+; The source must satisfy gcn_unspec_operand.  The output template is empty
+; and the length is 0, so this insn emits no assembly at all.
+(define_insn "mov<mode>_unspec"
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand" "=v")
+	(match_operand:VEC_REG_MODE 1 "gcn_unspec_operand"   " U"))]
+  ""
+  ""
+  [(set_attr "type" "unknown")
+   (set_attr "length" "0")
+   (set_attr "exec" "auto")])
+
+; A vector move that does not reference EXEC explicitly, and therefore is
+; suitable for use during or after LRA.  It uses the "exec" attribute instead.
+
+; Single-register form; only enabled during or after LRA.
+;   alt 0: VGPR or inline constant source (4 bytes)
+;   alt 1: 32-bit immediate source (8 bytes)
+(define_insn "mov<mode>_full"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v")
+	(match_operand:VEC_1REG_MODE 1 "general_operand"      "vA,B"))]
+  "lra_in_progress || reload_completed"
+  "v_mov_b32\t%0, %1"
+  [(set_attr "type" "vop1,vop1")
+   (set_attr "length" "4,8")
+   (set_attr "exec" "full")])
+
+; Two-register form; only enabled during or after LRA.  The value is moved
+; as two 32-bit halves (%L = low, %H = high).
+(define_insn "mov<mode>_full"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand"  "=v")
+	(match_operand:VEC_2REG_MODE 1 "general_operand"      "vDB"))]
+  "lra_in_progress || reload_completed"
+  {
+    /* Copy the low half first unless the destination has the higher register
+       number, in which case copy the high half first.  Presumably this keeps
+       an overlapping source half from being overwritten before it is
+       read.  */
+    if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+      return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+    else
+      return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "exec" "full")])
+
+; A SGPR-base load looks like:
+;   <load> v, Sg
+;
+; There's no hardware instruction that corresponds to this, but vector base
+; addresses are placed in an SGPR because it is easier to add to a vector.
+; We also have a temporary vT, and the vector v1 holding numbered lanes.
+;
+; Rewrite as:
+;   vT = v1 << log2(element-size)
+;   vT += Sg
+;   flat_load v, vT
+
+; Single-register form; only enabled during or after LRA.
+;   alt 0,1: register/constant move, emitted directly as v_mov_b32
+;   alt 2:   load from memory  -- outputs "#", so it must be split
+;   alt 3:   store to memory   -- outputs "#", so it must be split
+; Operand 2 is an early-clobber V64DI VGPR in every alternative.
+(define_insn "mov<mode>_sgprbase"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "= v, v, v, m")
+	(unspec:VEC_1REG_MODE
+	  [(match_operand:VEC_1REG_MODE 1 "general_operand"   " vA,vB, m, v")]
+	  UNSPEC_SGPRBASE))
+   (clobber (match_operand:V64DI 2 "register_operand"	      "=&v,&v,&v,&v"))]
+  "lra_in_progress || reload_completed"
+  "@
+   v_mov_b32\t%0, %1
+   v_mov_b32\t%0, %1
+   #
+   #"
+  [(set_attr "type" "vop1,vop1,*,*")
+   (set_attr "length" "4,8,12,12")
+   (set_attr "exec" "full")])
+
+; Two-register form; only enabled during or after LRA.
+;   alt 0: register/constant move, emitted as two v_mov_b32 (low and high
+;	   halves).  The low half goes first unless the destination has the
+;	   higher register number.
+;   alt 1: load from memory  -- outputs "#", so it must be split
+;   alt 2: store to memory   -- outputs "#", so it must be split
+; Operand 2 is an early-clobber V64DI VGPR in every alternative.
+;
+; The length of alt 0 is 16, not 8: the "DB" constraint admits constants, so
+; each of the two v_mov_b32 may carry a 32-bit literal (8 bytes each).  This
+; matches the length used by the two-register mov<mode>_full.
+(define_insn "mov<mode>_sgprbase"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "= v, v, m")
+	(unspec:VEC_2REG_MODE
+	  [(match_operand:VEC_2REG_MODE 1 "general_operand"   "vDB, m, v")]
+	  UNSPEC_SGPRBASE))
+   (clobber (match_operand:V64DI 2 "register_operand"	      "=&v,&v,&v"))]
+  "lra_in_progress || reload_completed"
+  "@
+   * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+       return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+     else \
+       return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+   #
+   #"
+  [(set_attr "type" "vmult,*,*")
+   (set_attr "length" "16,12,12")
+   (set_attr "exec" "full")])
+
+; reload_in was once a standard name, but here it's only referenced by
+; gcn_secondary_reload.  It allows a reload with a scratch register.
+
+; Operand 0: destination VGPR; operand 1: source memory; operand 2: V64DI
+; scratch.  All three are forwarded unchanged to mov<mode>_sgprbase.
+(define_expand "reload_in<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand" "= v")
+	(match_operand:VEC_REG_MODE 1 "memory_operand"   "  m"))
+   (clobber (match_operand:V64DI 2 "register_operand"    "=&v"))]
+  ""
+  {
+    emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+    DONE;
+  })
+
+; reload_out is similar to reload_in, above.
+
+; Operand 0: destination memory; operand 1: source VGPR; operand 2: V64DI
+; scratch.  All three are forwarded unchanged to mov<mode>_sgprbase.
+(define_expand "reload_out<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "memory_operand"   "= m")
+	(match_operand:VEC_REG_MODE 1 "register_operand" "  v"))
+   (clobber (match_operand:V64DI 2 "register_operand"    "=&v"))]
+  ""
+  {
+    emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+    DONE;
+  })
+
+; This is the 'normal' kind of vector move created before register allocation.
+
+; Single-register form.  Operand 1 is the new value, operand 2 the lane mask
+; and operand 3 the value kept in lanes whose mask bit is clear ("U0": an
+; unspec, or the same register as the destination).
+;   alt 0,1: mask in EXEC; plain v_mov_b32 (inline constant / 32-bit literal)
+;   alt 2:   mask in VCC; v_cndmask_b32 selecting between operands 3 and 1
+;   alt 3:   mask in an SGPR; v_cndmask_b32 with an explicit mask operand
+;   alt 4:   load from memory  -- outputs "#", so it must be split
+;   alt 5:   store to memory   -- outputs "#", so it must be split
+; Only the memory alternatives need the V64DI scratch (operand 4).  The insn
+; condition rejects memory-to-memory and constant-to-memory moves.
+(define_insn "mov<mode>_vector"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand"
+							 "=v, v, v, v, v, m")
+	(vec_merge:VEC_1REG_MODE
+	  (match_operand:VEC_1REG_MODE 1 "general_operand"
+							 "vA, B, v,vA, m, v")
+	  (match_operand:VEC_1REG_MODE 3 "gcn_alu_or_unspec_operand"
+							 "U0,U0,vA,vA,U0,U0")
+	  (match_operand:DI 2 "register_operand"	 " e, e,cV,Sg, e, e")))
+   (clobber (match_scratch:V64DI 4			 "=X, X, X, X,&v,&v"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1])"
+  "@
+   v_mov_b32\t%0, %1
+   v_mov_b32\t%0, %1
+   v_cndmask_b32\t%0, %3, %1, vcc
+   v_cndmask_b32\t%0, %3, %1, %2
+   #
+   #"
+  [(set_attr "type" "vop1,vop1,vop2,vop3a,*,*")
+   (set_attr "length" "4,8,4,8,16,16")
+   (set_attr "exec" "*,*,full,full,*,*")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+; Single-register form.  The value for inactive lanes is (match_dup 0), i.e.
+; the destination itself, which may be a MEM.  The mask must satisfy
+; gcn_exec_reg_operand.
+;   alt 0,1: register destination; plain v_mov_b32
+;   alt 2:   load from memory  -- outputs "#", so it must be split
+;   alt 3:   store to memory   -- outputs "#", so it must be split
+(define_insn "*mov<mode>_vector_match"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v, v, m")
+	(vec_merge:VEC_1REG_MODE
+	  (match_operand:VEC_1REG_MODE 1 "general_operand"    "vA,B, m, v")
+	  (match_dup 0)
+	  (match_operand:DI 2 "gcn_exec_reg_operand"	      " e,e, e, e")))
+   (clobber (match_scratch:V64DI 3			      "=X,X,&v,&v"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1])"
+  "@
+  v_mov_b32\t%0, %1
+  v_mov_b32\t%0, %1
+  #
+  #"
+  [(set_attr "type" "vop1,vop1,*,*")
+   (set_attr "length" "4,8,16,16")])
+
+; Two-register form of the masked vector move.  Each alternative that emits
+; code does so as a pair of 32-bit operations on the low (%L) and high (%H)
+; halves.
+;   alt 0: mask in EXEC; two v_mov_b32
+;   alt 1: mask in VCC; two v_cndmask_b32
+;   alt 2: mask in an SGPR; two v_cndmask_b32 with an explicit mask operand
+;   alt 3: load from memory  -- returns "#", so it must be split
+;   alt 4: store to memory   -- returns "#", so it must be split
+(define_insn "mov<mode>_vector"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand"
+						       "= v,   v,   v, v, m")
+	(vec_merge:VEC_2REG_MODE
+	  (match_operand:VEC_2REG_MODE 1 "general_operand"
+						       "vDB,  v0,  v0, m, v")
+	  (match_operand:VEC_2REG_MODE 3 "gcn_alu_or_unspec_operand"
+						       " U0,vDA0,vDA0,U0,U0")
+	  (match_operand:DI 2 "register_operand"       "  e,  cV,  Sg, e, e")))
+   (clobber (match_scratch:V64DI 4		       "= X,   X,   X,&v,&v"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1])"
+  {
+    /* Low half first, unless the destination has the higher register number;
+       then the high half goes first.  */
+    if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+      switch (which_alternative)
+	{
+	case 0:
+	  return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+	case 1:
+	  return "v_cndmask_b32\t%L0, %L3, %L1, vcc\;"
+		 "v_cndmask_b32\t%H0, %H3, %H1, vcc";
+	case 2:
+	  return "v_cndmask_b32\t%L0, %L3, %L1, %2\;"
+		 "v_cndmask_b32\t%H0, %H3, %H1, %2";
+	}
+    else
+      switch (which_alternative)
+	{
+	case 0:
+	  return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+	case 1:
+	  return "v_cndmask_b32\t%H0, %H3, %H1, vcc\;"
+		 "v_cndmask_b32\t%L0, %L3, %L1, vcc";
+	case 2:
+	  return "v_cndmask_b32\t%H0, %H3, %H1, %2\;"
+		 "v_cndmask_b32\t%L0, %L3, %L1, %2";
+	}
+
+    /* Alternatives 3 and 4 (the memory forms) have no case above and end up
+       here.  */
+    return "#";
+  }
+  [(set_attr "type" "vmult,vmult,vmult,*,*")
+   (set_attr "length" "16,16,16,16,16")
+   (set_attr "exec" "*,full,full,*,*")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+; Two-register form.  The value for inactive lanes is (match_dup 0), i.e. the
+; destination itself, which may be a MEM.
+;   alt 0: register destination; two v_mov_b32, ordered by register number
+;   alt 1: load from memory  -- outputs "#", so it must be split
+;   alt 2: store to memory   -- outputs "#", so it must be split
+(define_insn "*mov<mode>_vector_match"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v, v, m")
+	(vec_merge:VEC_2REG_MODE
+	  (match_operand:VEC_2REG_MODE 1 "general_operand"   "vDB, m, v")
+	  (match_dup 0)
+	  (match_operand:DI 2 "gcn_exec_reg_operand"	      " e, e, e")))
+   (clobber (match_scratch:V64DI 3			      "=X,&v,&v"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1])"
+  "@
+   * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+       return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+     else \
+       return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+   #
+   #"
+  [(set_attr "type" "vmult,*,*")
+   (set_attr "length" "16,16,16")])
+
+; Expand scalar addresses into gather/scatter patterns
+
+; Unmasked store: rewrite an UNSPEC_SGPRBASE move into memory as an
+; UNSPEC_SCATTER.  Operand 2 (the scratch) is handed to the address
+; expansion; no exec register is passed to it (NULL), and the scatter's exec
+; field is the constant -1.
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+	(unspec:VEC_REG_MODE
+	  [(match_operand:VEC_REG_MODE 1 "general_operand")]
+	  UNSPEC_SGPRBASE))
+   (clobber (match_scratch:V64DI 2))]
+  ""
+  [(set (mem:BLK (scratch))
+	(unspec:BLK [(match_dup 5) (match_dup 1)
+		     (match_dup 6) (match_dup 7) (match_dup 8)]
+		    UNSPEC_SCATTER))]
+  {
+    /* operands[5]: vector address expression; [6]: address space;
+       [7]: volatile flag; [8]: exec mask (all ones).  */
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+						       operands[0],
+						       operands[2]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+    operands[8] = gen_rtx_CONST_INT (VOIDmode, -1);
+  })
+
+; Masked store: rewrite a vec_merge into memory as an UNSPEC_SCATTER whose
+; exec field is the vec_merge mask (operand 2).  Operand 3, the value for
+; inactive lanes, is matched but not used in the replacement.
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+	(vec_merge:VEC_REG_MODE
+	  (match_operand:VEC_REG_MODE 1 "general_operand")
+	  (match_operand:VEC_REG_MODE 3 "")
+	  (match_operand:DI 2 "gcn_exec_reg_operand")))
+   (clobber (match_scratch:V64DI 4))]
+  ""
+  [(set (mem:BLK (scratch))
+	(unspec:BLK [(match_dup 5) (match_dup 1)
+		     (match_dup 6) (match_dup 7) (match_dup 2)]
+		    UNSPEC_SCATTER))]
+  {
+    /* operands[5]: vector address expression; [6]: address space;
+       [7]: volatile flag.  */
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+						       operands[2],
+						       operands[0],
+						       operands[4]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+  })
+
+; Unmasked load: rewrite an UNSPEC_SGPRBASE move from memory as an
+; UNSPEC_GATHER wrapped in a vec_merge.  The merge uses an undefined value
+; for inactive lanes and the constant -1 as its mask.
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+	(unspec:VEC_REG_MODE
+	  [(match_operand:VEC_REG_MODE 1 "memory_operand")]
+	  UNSPEC_SGPRBASE))
+   (clobber (match_scratch:V64DI 2))]
+  ""
+  [(set (match_dup 0)
+	(vec_merge:VEC_REG_MODE
+	  (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+				(mem:BLK (scratch))]
+			       UNSPEC_GATHER)
+	  (match_dup 8)
+	  (match_dup 9)))]
+  {
+    /* operands[5]: vector address expression; [6]: address space;
+       [7]: volatile flag; [8]: undefined previous value; [9]: mask.  */
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+						       operands[1],
+						       operands[2]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+    operands[8] = gcn_gen_undef (<MODE>mode);
+    operands[9] = gen_rtx_CONST_INT (VOIDmode, -1);
+  })
+
+; Masked load: rewrite a vec_merge from memory as an UNSPEC_GATHER wrapped in
+; a vec_merge that keeps the original mask (operand 2) and the original
+; inactive-lane value (operand 3).
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+	(vec_merge:VEC_REG_MODE
+	  (match_operand:VEC_REG_MODE 1 "memory_operand")
+	  (match_operand:VEC_REG_MODE 3 "")
+	  (match_operand:DI 2 "gcn_exec_reg_operand")))
+   (clobber (match_scratch:V64DI 4))]
+  ""
+  [(set (match_dup 0)
+	(vec_merge:VEC_REG_MODE
+	  (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+				(mem:BLK (scratch))]
+			       UNSPEC_GATHER)
+	  (match_dup 3)
+	  (match_dup 2)))]
+  {
+    /* operands[5]: vector address expression; [6]: address space;
+       [7]: volatile flag.  */
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+						       operands[2],
+						       operands[1],
+						       operands[4]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+  })
+
+; TODO: Add zero/sign extending variants.
+
+;; }}}
+;; {{{ Lane moves
+
+; v_writelane and v_readlane work regardless of exec flags.
+; We allow source to be scratch.
+;
+; FIXME these should take A immediates
+
+; Single-register form.  Writes scalar operand 1 into the lane numbered by
+; operand 2 (the merge mask is 1 << operand 2); every other lane takes its
+; value from operand 3.
+(define_insn "*vec_set<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"            "= v")
+	(vec_merge:VEC_1REG_MODE
+	  (vec_duplicate:VEC_1REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "register_operand"	     " SS"))
+	  (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+								     " U0")
+	  (ashift (const_int 1)
+		  (match_operand:SI 2 "gcn_alu_operand"		     "SSB"))))]
+  ""
+  "v_writelane_b32 %0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "laneselect" "yes")])
+
+; FIXME: 64bit operations really should be splitters, but I am not sure how
+; to represent vertical subregs.
+; Two-register form: the low and high halves of the scalar are written with
+; two separate v_writelane_b32 instructions using the same lane number.
+(define_insn "*vec_set<mode>"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"	     "= v")
+	(vec_merge:VEC_2REG_MODE
+	  (vec_duplicate:VEC_2REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "register_operand"	     " SS"))
+	  (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+								     " U0")
+	  (ashift (const_int 1)
+		  (match_operand:SI 2 "gcn_alu_operand"		     "SSB"))))]
+  ""
+  "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2"
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "laneselect" "yes")])
+
+; Named expander for setting one lane.  Operand 0 is both the destination
+; and, via match_dup, the source of every lane other than the one selected
+; by operand 2.  There is no preparation code, so the pattern is emitted
+; as written.
+(define_expand "vec_set<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+	(vec_merge:VEC_REG_MODE
+	  (vec_duplicate:VEC_REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "register_operand"))
+	  (match_dup 0)
+	  (ashift (const_int 1) (match_operand:SI 2 "gcn_alu_operand"))))]
+  "")
+
+; Single-register form in which the merge mask is a constant.  The insn
+; condition requires the mask to be a single set bit in positions 0..63:
+; exact_log2 returns -1 otherwise, which the unsigned cast turns into a
+; value that fails the "< 64" test.
+(define_insn "*vec_set<mode>_1"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"	       "=v")
+	(vec_merge:VEC_1REG_MODE
+	  (vec_duplicate:VEC_1REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "register_operand"	       "SS"))
+	  (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+								       "U0")
+	  (match_operand:SI 2 "const_int_operand"	               " i")))]
+  "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+  {
+    /* Turn the one-bit mask into the lane number that v_writelane_b32
+       takes as its last operand.  */
+    operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+    return "v_writelane_b32 %0, %1, %2";
+  }
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "laneselect" "yes")])
+
+; Two-register form in which the merge mask is a constant with a single set
+; bit in positions 0..63 (see the insn condition).  The low and high halves
+; are written with separate v_writelane_b32 instructions.
+(define_insn "*vec_set<mode>_1"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"	       "=v")
+	(vec_merge:VEC_2REG_MODE
+	  (vec_duplicate:VEC_2REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "register_operand"	       "SS"))
+	  (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+								       "U0")
+	  (match_operand:SI 2 "const_int_operand"		       " i")))]
+  "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+  {
+    /* Turn the one-bit mask into the lane number that v_writelane_b32
+       takes as its last operand.  */
+    operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+    return "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2";
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "laneselect" "yes")])
+
+; Single-register form: broadcast scalar operand 1 (an SGPR or a 32-bit
+; immediate) to every lane of operand 0.  The "exec" attribute is "full".
+(define_insn "vec_duplicate<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"  "=v")
+	(vec_duplicate:VEC_1REG_MODE
+	  (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SgB")))]
+  ""
+  "v_mov_b32\t%0, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+; Two-register form: broadcast a 64-bit scalar as two v_mov_b32, one per
+; 32-bit half.
+; NOTE(review): this two-instruction insn has type "vop3a" whereas the
+; two-register vec_duplicate<mode>_exec uses "vmult" -- confirm which is
+; intended.
+(define_insn "vec_duplicate<mode>"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"  "=  v")
+	(vec_duplicate:VEC_2REG_MODE
+	  (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SgDB")))]
+  ""
+  "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
+  [(set_attr "type" "vop3a")
+   (set_attr "exec" "full")
+   (set_attr "length" "16")])
+
+; Single-register masked broadcast: lanes selected by the mask in operand 2
+; receive scalar operand 1; the remaining lanes take operand 3.
+(define_insn "vec_duplicate<mode>_exec"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"	      "= v")
+	(vec_merge:VEC_1REG_MODE
+	  (vec_duplicate:VEC_1REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand"	      "SSB"))
+	  (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+								      " U0")
+	  (match_operand:DI 2 "gcn_exec_reg_operand"		      "  e")))]
+  ""
+  "v_mov_b32\t%0, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+; Two-register masked broadcast, emitted as two v_mov_b32 (low and high
+; halves).
+; NOTE(review): operand 1 uses the predicate register_operand, so the "DB"
+; (constant) part of its constraint can never match.  The unmasked
+; two-register vec_duplicate<mode> uses gcn_alu_operand instead -- confirm
+; which is intended.
+(define_insn "vec_duplicate<mode>_exec"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"	      "= v")
+	(vec_merge:VEC_2REG_MODE
+	  (vec_duplicate:VEC_2REG_MODE
+	    (match_operand:<SCALAR_MODE> 1 "register_operand"	     "SgDB"))
+	  (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+								      " U0")
+	  (match_operand:DI 2 "gcn_exec_reg_operand"		      "  e")))]
+  ""
+  "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")])
+
+; Single-register form: read the lane of vector operand 1 selected by
+; operand 2 into SGPR operand 0.
+(define_insn "vec_extract<mode><scalar_mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"   "=Sg")
+	(vec_select:<SCALAR_MODE>
+	  (match_operand:VEC_1REG_MODE 1 "register_operand" "  v")
+	  (parallel [(match_operand:SI 2 "gcn_alu_operand"  "SSB")])))]
+  ""
+  "v_readlane_b32 %0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "laneselect" "yes")])
+
+; Two-register form: the low and high halves of the selected lane are read
+; with two separate v_readlane_b32 instructions.
+(define_insn "vec_extract<mode><scalar_mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"   "=Sg")
+	(vec_select:<SCALAR_MODE>
+	  (match_operand:VEC_2REG_MODE 1 "register_operand" "  v")
+	  (parallel [(match_operand:SI 2 "gcn_alu_operand"  "SSB")])))]
+  ""
+  "v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2"
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "laneselect" "yes")])
+
+; Vector initialisation.  All the work is delegated to
+; gcn_expand_vector_init, which is defined outside this file.
+(define_expand "vec_init<mode><scalar_mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand 1)]
+  ""
+  {
+    gcn_expand_vector_init (operands[0], operands[1]);
+    DONE;
+  })
+
+;; }}}
+;; {{{ Scatter / Gather
+
+;; GCN does not have an instruction for loading a vector from contiguous
+;; memory so *all* loads and stores are eventually converted to scatter
+;; or gather.
+;;
+;; GCC does not permit MEM to hold vectors of addresses, so we must use an
+;; unspec.  The unspec formats are as follows:
+;;
+;;     (unspec:V64??
+;;	 [(<address expression>)
+;;	  (<addr_space_t>)
+;;	  (<use_glc>)
+;;	  (mem:BLK (scratch))]
+;;	 UNSPEC_GATHER)
+;;
+;;     (unspec:BLK
+;;	  [(<address expression>)
+;;	   (<source register>)
+;;	   (<addr_space_t>)
+;;	   (<use_glc>)
+;;	   (<exec>)]
+;;	  UNSPEC_SCATTER)
+;;
+;; - Loads are expected to be wrapped in a vec_merge, so do not need <exec>.
+;; - The mem:BLK does not contain any real information, but indicates that an
+;;   unknown memory read is taking place.  Stores are expected to use a similar
+;;   mem:BLK outside the unspec.
+;; - The address space and glc (volatile) fields are there to replace the
+;;   fields normally found in a MEM.
+;; - Multiple forms of address expression are supported, below.
+
+; Operands: 0 = destination vector, 1 = DImode base, 2 = vector of offsets,
+; 3 = immediate flag (read as "unsignedp" by gather<mode>_exec), 4 = scale.
+; Normalises the offsets to V64SI and delegates to gather<mode>_exec with the
+; mask from gcn_full_exec_reg ().
+(define_expand "gather_load<mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")]
+  ""
+  {
+    rtx exec = gcn_full_exec_reg ();
+
+    /* TODO: more conversions will be needed when more types are vectorized. */
+    /* 64-bit offsets are truncated to 32 bits, because gather<mode>_exec
+       takes its offsets as V64SI.  */
+    if (GET_MODE (operands[2]) == V64DImode)
+      {
+	rtx tmp = gen_reg_rtx (V64SImode);
+	emit_insn (gen_vec_truncatev64div64si (tmp, operands[2],
+					       gcn_gen_undef (V64SImode),
+					       exec));
+	operands[2] = tmp;
+      }
+
+    emit_insn (gen_gather<mode>_exec (operands[0], operands[1], operands[2],
+				      operands[3], operands[4], exec));
+    DONE;
+  })
+
+; Masked gather from base + offsets * scale.
+; Operands: 0 = destination, 1 = DImode base, 2 = V64SI offsets,
+; 3 = "unsignedp" flag, 4 = scale, 5 = exec mask.
+(define_expand "gather<mode>_exec"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand:V64SI 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")
+   (match_operand:DI 5 "gcn_exec_reg_operand")]
+  ""
+  {
+    rtx dest = operands[0];
+    rtx base = operands[1];
+    rtx offsets = operands[2];
+    int unsignedp = INTVAL (operands[3]);
+    rtx scale = operands[4];
+    rtx exec = operands[5];
+
+    rtx tmpsi = gen_reg_rtx (V64SImode);
+    rtx tmpdi = gen_reg_rtx (V64DImode);
+    rtx undefsi = gcn_gen_undef (V64SImode);
+    rtx undefdi = gcn_gen_undef (V64DImode);
+    rtx undefmode = gcn_gen_undef (<MODE>mode);
+
+    /* Step 1: tmpsi = offsets * scale.  A positive power-of-two constant
+       scale becomes a shift; anything else is a multiply.  */
+    if (CONST_INT_P (scale)
+	&& INTVAL (scale) > 0
+	&& exact_log2 (INTVAL (scale)) >= 0)
+      emit_insn (gen_ashlv64si3 (tmpsi, offsets,
+				 GEN_INT (exact_log2 (INTVAL (scale)))));
+    else
+      emit_insn (gen_mulv64si3_vector_dup (tmpsi, offsets, scale, exec,
+					   undefsi));
+
+    /* Step 2: emit the gather in the form the default address space
+       needs.  */
+    if (DEFAULT_ADDR_SPACE == ADDR_SPACE_FLAT)
+      {
+	/* Flat: form full 64-bit per-lane addresses (base plus the zero- or
+	   sign-extended scaled offsets), then gather with a zero constant
+	   offset.  */
+	if (unsignedp)
+	  emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base, exec,
+					      undefdi));
+	else
+	  emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base, exec,
+					      undefdi));
+	emit_insn (gen_gather<mode>_insn_1offset (dest, tmpdi, const0_rtx,
+						  const0_rtx, const0_rtx,
+						  undefmode, exec));
+      }
+    /* Global: pass the scalar base and the 32-bit offsets separately.
+       NOTE(review): unsignedp is not consulted on this path, and the
+       2offsets pattern sign-extends the offsets -- confirm that unsigned
+       offsets cannot reach here.  */
+    else if (DEFAULT_ADDR_SPACE == ADDR_SPACE_GLOBAL)
+      emit_insn (gen_gather<mode>_insn_2offsets (dest, base, tmpsi, const0_rtx,
+						 const0_rtx, const0_rtx,
+						 undefmode, exec));
+    else
+      gcc_unreachable ();
+    DONE;
+  })
+
+; Allow any address expression
+; Operands: 0 = destination, 1 = address expression (no predicate, so any
+; rtx is accepted), 2 = address space, 3 = volatile/glc flag, 4 = value for
+; inactive lanes, 5 = exec mask.  The preparation block is empty, so the
+; pattern is emitted exactly as written.
+(define_expand "gather<mode>_expr"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+	(vec_merge:VEC_REG_MODE
+	  (unspec:VEC_REG_MODE
+	    [(match_operand 1 "")
+	     (match_operand 2 "immediate_operand")
+	     (match_operand 3 "immediate_operand")
+	     (mem:BLK (scratch))]
+	    UNSPEC_GATHER)
+	  (match_operand:VEC_REG_MODE 4 "gcn_register_or_unspec_operand")
+	  (match_operand:DI 5 "gcn_exec_operand")))]
+    ""
+    {})
+
+; Gather from a vector of 64-bit addresses (operand 1) plus one constant
+; offset (operand 2).  Operand 3 is the address space, operand 4 the glc
+; (volatile) flag, operand 5 the value for inactive lanes and operand 6 the
+; exec mask (EXEC itself, or the constant -1 with the "exec" attribute set
+; to "full").
+;
+; Accepted address spaces and offsets:
+;   flat:   offset 0 on any target; a non-zero offset below 0x1000 only when
+;	    TARGET_GCN5_PLUS, because the output code below prints no
+;	    "offset:" field for flat loads on other targets and a non-zero
+;	    offset would otherwise be dropped silently.
+;   global: offset in [-0x1000, 0xfff].
+(define_insn "gather<mode>_insn_1offset"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"	   "=v,  v")
+	(vec_merge:VEC_REG_MODE
+	  (unspec:VEC_REG_MODE
+	    [(plus:V64DI (match_operand:V64DI 1 "register_operand" " v,  v")
+			 (vec_duplicate:V64DI
+			   (match_operand 2 "immediate_operand"	   " n,  n")))
+	     (match_operand 3 "immediate_operand"		   " n,  n")
+	     (match_operand 4 "immediate_operand"		   " n,  n")
+	     (mem:BLK (scratch))]
+	    UNSPEC_GATHER)
+	  (match_operand:VEC_REG_MODE 5 "gcn_register_or_unspec_operand"
+								   "U0, U0")
+	  (match_operand:DI 6 "gcn_exec_operand"		   " e,*Kf")))]
+  "(AS_FLAT_P (INTVAL (operands[3]))
+    && (INTVAL(operands[2]) == 0
+	|| (TARGET_GCN5_PLUS
+	    && (unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000)))
+    || (AS_GLOBAL_P (INTVAL (operands[3]))
+	&& (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+    /* The template is built at run time because the glc suffix is
+       optional.  "%%" yields a literal '%' for the operand
+       substitution that happens after this returns.  */
+    static char buf[200];
+    if (AS_FLAT_P (as))
+      {
+	if (TARGET_GCN5_PLUS)
+	  sprintf (buf, "flat_load%%s0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0",
+		   glc);
+	else
+	  /* No offset field is printed here; the insn condition guarantees
+	     that the offset is zero.  */
+	  sprintf (buf, "flat_load%%s0\t%%0, %%1%s\;s_waitcnt\t0", glc);
+      }
+    else if (AS_GLOBAL_P (as))
+      sprintf (buf, "global_load%%s0\t%%0, %%1, off offset:%%2%s\;"
+	       "s_waitcnt\tvmcnt(0)", glc);
+    else
+      gcc_unreachable ();
+
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")
+   (set_attr "exec" "*,full")])
+
+; Gather through a vector of 32-bit addresses (V64SI operand 1) plus an
+; immediate offset (operand 2), for address spaces accepted by AS_ANY_DS_P
+; (operand 3).  The offset must be in the range 0..0xffff.  Operand 4 is
+; matched but not used by the output code.
+(define_insn "gather<mode>_insn_1offset_ds"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"	   "=v,  v")
+	(vec_merge:VEC_REG_MODE
+	  (unspec:VEC_REG_MODE
+	    [(plus:V64SI (match_operand:V64SI 1 "register_operand" " v,  v")
+			 (vec_duplicate:V64SI
+			   (match_operand 2 "immediate_operand"	   " n,  n")))
+	     (match_operand 3 "immediate_operand"		   " n,  n")
+	     (match_operand 4 "immediate_operand"		   " n,  n")
+	     (mem:BLK (scratch))]
+	    UNSPEC_GATHER)
+	  (match_operand:VEC_REG_MODE 5 "gcn_register_or_unspec_operand"
+								   "U0, U0")
+	  (match_operand:DI 6 "gcn_exec_operand"		   " e,*Kf")))]
+  "(AS_ANY_DS_P (INTVAL (operands[3]))
+    && ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x10000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    static char buf[200];
+    /* The " gds" modifier is appended only when AS_GDS_P holds.  */
+    sprintf (buf, "ds_read%%b0\t%%0, %%1 offset:%%2%s\;s_waitcnt\tlgkmcnt(0)",
+	     (AS_GDS_P (as) ? " gds" : ""));
+    return buf;
+  }
+  [(set_attr "type" "ds")
+   (set_attr "length" "12")
+   (set_attr "exec" "*,full")])
+
+; Gather from a scalar DImode base (operand 1) plus a sign-extended V64SI
+; per-lane offset (operand 2) plus an immediate offset (operand 3).  Only
+; the global address space (operand 4) is accepted, with the immediate in
+; the range -0x1000..0xfff.  Operand 5 selects " glc" when non-zero.
+(define_insn "gather<mode>_insn_2offsets"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"	   "=v,  v")
+	(vec_merge:VEC_REG_MODE
+	  (unspec:VEC_REG_MODE
+	    [(plus:V64DI
+	       (plus:V64DI
+		 (vec_duplicate:V64DI
+		   (match_operand:DI 1 "register_operand"	   "SS, SS"))
+		 (sign_extend:V64DI
+		   (match_operand:V64SI 2 "register_operand"	   " v,  v")))
+	       (vec_duplicate:V64DI (match_operand 3 "immediate_operand" 
+								   " n,  n")))
+	     (match_operand 4 "immediate_operand"		   " n,  n")
+	     (match_operand 5 "immediate_operand"		   " n,  n")
+	     (mem:BLK (scratch))]
+	    UNSPEC_GATHER)
+	  (match_operand:VEC_REG_MODE 6 "gcn_register_or_unspec_operand"
+								   "U0, U0")
+	  (match_operand:DI 7 "gcn_exec_operand"		   " e,*Kf")))]
+  "(AS_GLOBAL_P (INTVAL (operands[4]))
+    && (((unsigned HOST_WIDE_INT)INTVAL(operands[3]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[4]);
+    const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+    static char buf[200];
+    if (AS_GLOBAL_P (as))
+      {
+	/* Work around assembler bug in which a 64-bit register is expected,
+	but a 32-bit value would be correct.  */
+	/* Operand 2 is therefore printed by hand as the register pair
+	   v[reg:reg+1] rather than through an operand substitution.  */
+	int reg = REGNO (operands[2]) - FIRST_VGPR_REG;
+	sprintf (buf, "global_load%%s0\t%%0, v[%d:%d], %%1 offset:%%3%s\;"
+		      "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+      }
+    else
+      gcc_unreachable ();
+      
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")
+   (set_attr "exec" "*,full")])
+
+; Scatter-store entry point without an explicit mask: it forwards to
+; scatter<mode>_exec using the register returned by gcn_full_exec_reg ().
+; Operands: 0 = base, 1 = offsets, 2 = immediate flag, 3 = scale,
+; 4 = the vector to store.
+(define_expand "scatter_store<mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")]
+  ""
+  {
+    rtx full_exec = gcn_full_exec_reg ();
+    rtx offsets = operands[1];
+
+    /* V64DI offsets are narrowed to V64SI first.
+       TODO: more conversions will be needed when more types are
+       vectorized.  */
+    if (GET_MODE (offsets) == V64DImode)
+      {
+	rtx narrowed = gen_reg_rtx (V64SImode);
+	rtx undef = gcn_gen_undef (V64SImode);
+	emit_insn (gen_vec_truncatev64div64si (narrowed, offsets, undef,
+					       full_exec));
+	offsets = narrowed;
+      }
+
+    emit_insn (gen_scatter<mode>_exec (operands[0], offsets, operands[2],
+				       operands[3], operands[4],
+				       full_exec));
+    DONE;
+  })
+
+; Scatter-store under an explicit mask (operand 5).  The V64SI offsets
+; (operand 1) are multiplied by the scale (operand 3) and then either
+; combined with the base (operand 0) into a vector of 64-bit addresses
+; (flat default address space) or handed to the base-plus-offset insn
+; (global default address space).  A non-zero operand 2 selects the
+; zero-extending address add instead of the sign-extending one.
+(define_expand "scatter<mode>_exec"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")
+   (match_operand:DI 5 "gcn_exec_reg_operand")]
+  ""
+  {
+    const bool zero_extend_p = INTVAL (operands[2]) != 0;
+
+    /* Both temporaries are created whichever path is taken below.  */
+    rtx scaled = gen_reg_rtx (V64SImode);
+    rtx addrs = gen_reg_rtx (V64DImode);
+    rtx undef_v64si = gcn_gen_undef (V64SImode);
+    rtx undef_v64di = gcn_gen_undef (V64DImode);
+
+    /* Step 1: scaled = offsets * scale.  A positive constant power of
+       two becomes a left shift; anything else is a multiply by the
+       duplicated scale.  */
+    int shift = -1;
+    if (CONST_INT_P (operands[3]) && INTVAL (operands[3]) > 0)
+      shift = exact_log2 (INTVAL (operands[3]));
+
+    if (shift >= 0)
+      emit_insn (gen_ashlv64si3 (scaled, operands[1], GEN_INT (shift)));
+    else
+      emit_insn (gen_mulv64si3_vector_dup (scaled, operands[1], operands[3],
+					   operands[5], undef_v64si));
+
+    /* Step 2: form the addresses and emit the store.  */
+    if (DEFAULT_ADDR_SPACE == ADDR_SPACE_FLAT)
+      {
+	rtx add_base
+	  = (zero_extend_p
+	     ? gen_addv64di3_zext_dup2 (addrs, scaled, operands[0],
+					operands[5], undef_v64di)
+	     : gen_addv64di3_sext_dup2 (addrs, scaled, operands[0],
+					operands[5], undef_v64di));
+	emit_insn (add_base);
+	emit_insn (gen_scatter<mode>_insn_1offset (addrs, const0_rtx,
+						   operands[4], const0_rtx,
+						   const0_rtx, operands[5]));
+      }
+    else if (DEFAULT_ADDR_SPACE == ADDR_SPACE_GLOBAL)
+      emit_insn (gen_scatter<mode>_insn_2offsets (operands[0], scaled,
+						  const0_rtx, operands[4],
+						  const0_rtx, const0_rtx,
+						  operands[5]));
+    else
+      gcc_unreachable ();
+    DONE;
+  })
+
+; Allow any address expression.
+; Operand 0 (the V64DI address vector) has an empty predicate, so this
+; expander places no restriction on its form.  Operand 1 is the vector to
+; store, operands 2 and 3 must be immediates and operand 4 is the DImode
+; mask.  The preparation block is empty.
+(define_expand "scatter<mode>_expr"
+  [(set (mem:BLK (scratch))
+	(unspec:BLK
+	  [(match_operand:V64DI 0 "")
+	   (match_operand:VEC_REG_MODE 1 "register_operand")
+	   (match_operand 2 "immediate_operand")
+	   (match_operand 3 "immediate_operand")
+	   (match_operand:DI 4 "gcn_exec_operand")]
+	  UNSPEC_SCATTER))]
+  ""
+  {})
+
+; Scatter to a vector of 64-bit addresses (operand 0) plus one immediate
+; offset (operand 1).  Operand 2 is the data, operand 3 the address space,
+; operand 4 selects " glc" when non-zero and operand 5 is the mask.
+;
+; Accepted offsets:
+;   flat:   zero always; 0..0xfff only when TARGET_GCN5_PLUS.
+;   global: -0x1000..0xfff.
+(define_insn "scatter<mode>_insn_1offset"
+  [(set (mem:BLK (scratch))
+	(unspec:BLK
+	  [(plus:V64DI (match_operand:V64DI 0 "register_operand" "v,  v")
+		       (vec_duplicate:V64DI
+			 (match_operand 1 "immediate_operand"	 "n,  n")))
+	   (match_operand:VEC_REG_MODE 2 "register_operand"	 "v,  v")
+	   (match_operand 3 "immediate_operand"			 "n,  n")
+	   (match_operand 4 "immediate_operand"			 "n,  n")
+	   (match_operand:DI 5 "gcn_exec_operand"		 "e,*Kf")]
+	  UNSPEC_SCATTER))]
+  "(AS_FLAT_P (INTVAL (operands[3]))
+    && (INTVAL(operands[1]) == 0
+	|| (TARGET_GCN5_PLUS
+	    && (unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000)))
+    || (AS_GLOBAL_P (INTVAL (operands[3]))
+	&& (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+    /* The template is built at output time; "%%" yields a literal '%'
+       for the later operand substitution.  */
+    static char buf[200];
+    if (AS_FLAT_P (as))
+      {
+	if (TARGET_GCN5_PLUS)
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;s_waitcnt\t0",
+		   glc);
+	else
+	  /* No offset field is printed here; the insn condition only
+	     admits a zero offset on this path.  */
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\t0", glc);
+      }
+    else if (AS_GLOBAL_P (as))
+      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
+	       "s_waitcnt\tvmcnt(0)", glc);
+    else
+      gcc_unreachable ();
+
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")
+   (set_attr "exec" "*,full")])
+
+; Scatter through a vector of 32-bit addresses (V64SI operand 0) plus an
+; immediate offset (operand 1), for address spaces accepted by AS_ANY_DS_P
+; (operand 3).  The offset must be in the range 0..0xffff.  Operand 4 is
+; matched but not used by the output code.
+(define_insn "scatter<mode>_insn_1offset_ds"
+  [(set (mem:BLK (scratch))
+	(unspec:BLK
+	  [(plus:V64SI (match_operand:V64SI 0 "register_operand" "v,  v")
+		       (vec_duplicate:V64SI
+			 (match_operand 1 "immediate_operand"	 "n,  n")))
+	   (match_operand:VEC_REG_MODE 2 "register_operand"	 "v,  v")
+	   (match_operand 3 "immediate_operand"			 "n,  n")
+	   (match_operand 4 "immediate_operand"			 "n,  n")
+	   (match_operand:DI 5 "gcn_exec_operand"		 "e,*Kf")]
+	  UNSPEC_SCATTER))]
+  "(AS_ANY_DS_P (INTVAL (operands[3]))
+    && ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    static char buf[200];
+    /* The " gds" modifier is appended only when AS_GDS_P holds.  */
+    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\tlgkmcnt(0)",
+	     (AS_GDS_P (as) ? " gds" : ""));
+    return buf;
+  }
+  [(set_attr "type" "ds")
+   (set_attr "length" "12")
+   (set_attr "exec" "*,full")])
+
+; Scatter to a scalar DImode base (operand 0) plus a sign-extended V64SI
+; per-lane offset (operand 1) plus an immediate offset (operand 2).  Only
+; the global address space (operand 4) is accepted, with the immediate in
+; the range -0x1000..0xfff.  Operand 3 is the data, operand 5 selects
+; " glc" when non-zero and operand 6 is the mask.
+(define_insn "scatter<mode>_insn_2offsets"
+  [(set (mem:BLK (scratch))
+	(unspec:BLK
+	  [(plus:V64DI
+	     (plus:V64DI
+	       (vec_duplicate:V64DI
+		 (match_operand:DI 0 "register_operand"		    "SS, SS"))
+	       (sign_extend:V64DI
+		 (match_operand:V64SI 1 "register_operand"	    " v,  v")))
+	     (vec_duplicate:V64DI (match_operand 2 "immediate_operand"
+								    " n,  n")))
+	   (match_operand:VEC_REG_MODE 3 "register_operand"	    " v,  v")
+	   (match_operand 4 "immediate_operand"			    " n,  n")
+	   (match_operand 5 "immediate_operand"			    " n,  n")
+	   (match_operand:DI 6 "gcn_exec_operand"		    " e,*Kf")]
+	  UNSPEC_SCATTER))]
+  "(AS_GLOBAL_P (INTVAL (operands[4]))
+    && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[4]);
+    const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+    static char buf[200];
+    if (AS_GLOBAL_P (as))
+      {
+	/* Work around assembler bug in which a 64-bit register is expected,
+	but a 32-bit value would be correct.  */
+	/* Operand 1 is therefore printed by hand as the register pair
+	   v[reg:reg+1] rather than through an operand substitution.  */
+	int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
+	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
+		      "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+      }
+    else
+      gcc_unreachable ();
+
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")
+   (set_attr "exec" "*,full")])
+
+;; }}}
+;; {{{ Permutations
+
+; UNSPEC_BPERMUTE for VEC_1REG_MODE modes, emitted as one ds_bpermute_b32
+; followed by a wait on lgkmcnt.  Note the operand numbering: the
+; same-mode input is operand 2 and the V64SI input is operand 1, so the
+; assembly prints them as %1, %2.  Operand 1 is presumably the per-lane
+; source selector -- confirm against the ISA manual.
+(define_insn "ds_bpermute<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"    "=v")
+	(unspec:VEC_1REG_MODE
+	  [(match_operand:VEC_1REG_MODE 2 "register_operand" " v")
+	   (match_operand:V64SI 1 "register_operand"	     " v")
+	   (match_operand:DI 3 "gcn_exec_reg_operand"	     " e")]
+	  UNSPEC_BPERMUTE))]
+  ""
+  "ds_bpermute_b32\t%0, %1, %2\;s_waitcnt\tlgkmcnt(0)"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "12")])
+
+; UNSPEC_BPERMUTE for VEC_2REG_MODE modes.  No assembly is emitted directly
+; ("#"); after reload the insn is split into two V64SI UNSPEC_BPERMUTE
+; sets, one for part 0 and one for part 1 of operands 0 and 2 (as returned
+; by gcn_operand_part), both sharing operands 1 and 3.
+(define_insn_and_split "ds_bpermute<mode>"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"    "=&v")
+	(unspec:VEC_2REG_MODE
+	  [(match_operand:VEC_2REG_MODE 2 "register_operand" " v0")
+	   (match_operand:V64SI 1 "register_operand"	     "  v")
+	   (match_operand:DI 3 "gcn_exec_reg_operand"	     "  e")]
+	  UNSPEC_BPERMUTE))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4) (unspec:V64SI [(match_dup 6) (match_dup 1) (match_dup 3)]
+				    UNSPEC_BPERMUTE))
+   (set (match_dup 5) (unspec:V64SI [(match_dup 7) (match_dup 1) (match_dup 3)]
+				    UNSPEC_BPERMUTE))]
+  {
+    /* Operands 4/5 are the two parts of the destination, 6/7 the two
+       parts of the source.  */
+    operands[4] = gcn_operand_part (<MODE>mode, operands[0], 0);
+    operands[5] = gcn_operand_part (<MODE>mode, operands[0], 1);
+    operands[6] = gcn_operand_part (<MODE>mode, operands[2], 0);
+    operands[7] = gcn_operand_part (<MODE>mode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "24")])
+
+;; }}}
+;; {{{ ALU special case: add/sub
+
+; The two integer vector modes handled by the add/sub expander below.
+(define_mode_iterator V64SIDI [V64SI V64DI])
+
+; Generic add/sub expander for V64SI and V64DI.  It wraps the operation in
+; a vec_merge whose mask (operand 3) is gcn_full_exec_reg () and whose
+; fallback (operand 4) is an undef value, and adds a clobber of VCC.
+(define_expand "<expander><mode>3"
+  [(parallel [(set (match_operand:V64SIDI 0 "register_operand")
+		   (vec_merge:V64SIDI
+		     (plus_minus:V64SIDI
+		       (match_operand:V64SIDI 1 "register_operand")
+		       (match_operand:V64SIDI 2 "gcn_alu_operand"))
+		     (match_dup 4)
+		     (match_dup 3)))
+	      (clobber (reg:DI VCC_REG))])]
+  ""
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (<MODE>mode);
+  })
+
+; V64SI add merged under mask operand 3 with fallback operand 4.  The
+; instruction is printed with vcc as its second operand, hence the VCC
+; clobber.
+(define_insn "addv64si3_vector"
+  [(set (match_operand:V64SI 0 "register_operand"		  "=  v")
+	(vec_merge:V64SI
+	  (plus:V64SI
+	    (match_operand:V64SI 1 "register_operand"		  "%  v")
+	    (match_operand:V64SI 2 "gcn_alu_operand"		  "vSSB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "   e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+; SImode add that prints the same v_add instruction as addv64si3_vector.
+; Operand 3 appears only in a (use); VCC is clobbered.
+(define_insn "addsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"	   "=  v")
+	  (plus:SI
+	    (match_operand:SI 1 "register_operand" "%  v")
+	    (match_operand:SI 2 "gcn_alu_operand"  "vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"	   "   e"))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+; V64SI add in which one addend is an SImode value (operand 2) broadcast
+; with vec_duplicate.  The second alternative sets the "exec" attribute
+; to "full".  VCC is clobbered.
+(define_insn "addv64si3_vector_dup"
+  [(set (match_operand:V64SI 0 "register_operand"		  "= v,  v")
+	(vec_merge:V64SI
+	  (plus:V64SI
+	    (vec_duplicate:V64SI
+	      (match_operand:SI 2 "gcn_alu_operand"		  "SSB,SSB"))
+	    (match_operand:V64SI 1 "register_operand"		  "  v,  v"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0, U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e,*Kf")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")
+   (set_attr "exec" "*,full")])
+
+; As addv64si3_vector, but the carry-out is a real output (operand 5)
+; instead of a VCC clobber.  The carry is described as
+; (operand 1 + operand 2) <u operand 1, ANDed with the mask (operand 3);
+; the bits outside the mask are taken from operand 6.
+(define_insn "addv64si3_vector_vcc"
+  [(set (match_operand:V64SI 0 "register_operand"	      "=  v,   v")
+	(vec_merge:V64SI
+	  (plus:V64SI
+	    (match_operand:V64SI 1 "register_operand"	      "%  v,   v")
+	    (match_operand:V64SI 2 "gcn_alu_operand"	      "vSSB,vSSB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand"
+							      "  U0,  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"	      "   e,   e")))
+   (set (match_operand:DI 5 "register_operand"		      "= cV,  Sg")
+	(ior:DI (and:DI (ltu:DI (plus:V64SI (match_dup 1) (match_dup 2))
+				(match_dup 1))
+			(match_dup 3))
+		(and:DI (not:DI (match_dup 3))
+			(match_operand:DI 6 "gcn_register_or_unspec_operand" 
+							      "  U5,  U5"))))]
+  ""
+  "v_add%^_u32\t%0, %5, %2, %1"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "8")])
+
+; This pattern only changes the VCC bits when the corresponding lane is
+; enabled, so the set must be described as an ior.
+;
+; Operand numbering: operand 1 is the V64SI register addend and operand 2
+; is the SImode scalar that is broadcast with vec_duplicate.  Operand 5
+; receives the carry-out; operand 6 supplies the bits outside the mask
+; (operand 3).
+
+(define_insn "addv64si3_vector_vcc_dup"
+  [(set (match_operand:V64SI 0 "register_operand"		 "= v,  v")
+	(vec_merge:V64SI
+	  (plus:V64SI
+	    (vec_duplicate:V64SI (match_operand:SI 2 "gcn_alu_operand"
+								 "SSB,SSB"))
+	    (match_operand:V64SI 1 "register_operand"		 "  v,  v"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "U0, U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		 "  e,  e")))
+   (set (match_operand:DI 5 "register_operand"			 "=cV, Sg")
+	(ior:DI (and:DI (ltu:DI (plus:V64SI (vec_duplicate:V64SI (match_dup 2))
+					    (match_dup 1))
+				(vec_duplicate:V64SI (match_dup 2)))
+			(match_dup 3))
+		(and:DI (not:DI (match_dup 3))
+			(match_operand:DI 6 "gcn_register_or_unspec_operand"
+								 " 5U, 5U"))))]
+  ""
+  "v_add%^_u32\t%0, %5, %2, %1"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "8,8")])
+
+; This pattern does not accept SGPR because VCC read already counts as an
+; SGPR use and number of SGPR operands is limited to 1.
+;
+; V64SI add with carry-in and carry-out.  The carry-in value is the
+; vec_merge of operands 7 and 8 under the DImode carry mask operand 5; the
+; splitters in this file pass the vector constants 1 and 0 for operands 7
+; and 8.  Operand 6 receives the carry-out under the mask (operand 3), and
+; operand 9 supplies the bits outside the mask.
+
+(define_insn "addcv64si3_vec"
+  [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+	(vec_merge:V64SI
+	  (plus:V64SI
+	    (plus:V64SI
+	      (vec_merge:V64SI
+		(match_operand:V64SI 7 "gcn_vec1_operand"	  "  A, A")
+		(match_operand:V64SI 8 "gcn_vec0_operand"	  "  A, A")
+		(match_operand:DI 5 "register_operand"		  " cV,Sg"))
+	      (match_operand:V64SI 1 "gcn_alu_operand"		  "%vA,vA"))
+	    (match_operand:V64SI 2 "gcn_alu_operand"		  " vB,vB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0,U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e, e")))
+   (set (match_operand:DI 6 "register_operand"			  "=cV,Sg")
+	(ior:DI (and:DI (ior:DI (ltu:DI (plus:V64SI (plus:V64SI
+						      (vec_merge:V64SI
+							(match_dup 7)
+							(match_dup 8)
+							(match_dup 5))
+						      (match_dup 1))
+						    (match_dup 2))
+					(match_dup 2))
+				(ltu:DI (plus:V64SI (vec_merge:V64SI
+						      (match_dup 7)
+						      (match_dup 8)
+						      (match_dup 5))
+						    (match_dup 1))
+					(match_dup 1)))
+			(match_dup 3))
+		(and:DI (not:DI (match_dup 3))
+			(match_operand:DI 9 "gcn_register_or_unspec_operand"
+								  " 6U,6U"))))]
+  ""
+  "v_addc%^_u32\t%0, %6, %1, %2, %5"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "4,8")])
+
+; As addcv64si3_vec, but the second addend is an SImode value (operand 2)
+; broadcast with vec_duplicate.  Carry-in comes from operand 5 (through
+; the vec_merge of operands 7 and 8) and carry-out goes to operand 6.
+(define_insn "addcv64si3_vec_dup"
+  [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+	(vec_merge:V64SI
+	  (plus:V64SI
+	    (plus:V64SI
+	      (vec_merge:V64SI
+		(match_operand:V64SI 7 "gcn_vec1_operand"	  "  A,  A")
+		(match_operand:V64SI 8 "gcn_vec0_operand"	  "  A,  A")
+		(match_operand:DI 5 "register_operand"		  " cV, Sg"))
+	      (match_operand:V64SI 1 "gcn_alu_operand"		  "%vA, vA"))
+	    (vec_duplicate:V64SI
+	      (match_operand:SI 2 "gcn_alu_operand"		  "SSB,SSB")))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0, U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e,  e")))
+   (set (match_operand:DI 6 "register_operand"			  "=cV, Sg")
+	(ior:DI (and:DI (ior:DI (ltu:DI (plus:V64SI (plus:V64SI
+						      (vec_merge:V64SI
+							(match_dup 7)
+							(match_dup 8)
+							(match_dup 5))
+						      (match_dup 1))
+						    (vec_duplicate:V64SI
+						      (match_dup 2)))
+					(vec_duplicate:V64SI
+					  (match_dup 2)))
+				(ltu:DI (plus:V64SI (vec_merge:V64SI
+						      (match_dup 7)
+						      (match_dup 8)
+						      (match_dup 5))
+						    (match_dup 1))
+					(match_dup 1)))
+			(match_dup 3))
+		(and:DI (not:DI (match_dup 3))
+			(match_operand:DI 9 "gcn_register_or_unspec_operand"
+								  " 6U,6U"))))]
+  ""
+  "v_addc%^_u32\t%0, %6, %1, %2, %5"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "4,8")])
+
+; V64SI subtract merged under mask operand 3 with fallback operand 4.
+; Subtraction is not commutative, so there are two alternatives: the first
+; prints v_sub with operand 2 restricted to "v", the second prints
+; v_subrev with the operands swapped and operand 1 restricted to "v".  The
+; insn condition requires at least one of operands 1 and 2 to be a
+; register.  VCC is clobbered.
+(define_insn "subv64si3_vector"
+  [(set (match_operand:V64SI 0 "register_operand"		 "=  v,   v")
+	(vec_merge:V64SI
+	  (minus:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand"		 "vSSB,   v")
+	    (match_operand:V64SI 2 "gcn_alu_operand"		 "   v,vSSB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0,  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		 "   e,   e")))
+   (clobber (reg:DI VCC_REG))]
+  "register_operand (operands[1], VOIDmode)
+   || register_operand (operands[2], VOIDmode)"
+  "@
+   v_sub%^_u32\t%0, vcc, %1, %2
+   v_subrev%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+; SImode subtract that prints the same v_sub / v_subrev instructions as
+; subv64si3_vector, with the same two-alternative scheme.  Operand 3
+; appears only in a (use); VCC is clobbered.
+(define_insn "subsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"	  "=  v,   v")
+	  (minus:SI
+	    (match_operand:SI 1 "gcn_alu_operand" "vSSB,   v")
+	    (match_operand:SI 2 "gcn_alu_operand" "   v,vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"	  "   e,   e"))
+   (clobber (reg:DI VCC_REG))]
+  "register_operand (operands[1], VOIDmode)
+   || register_operand (operands[2], VOIDmode)"
+  "@
+   v_sub%^_u32\t%0, vcc, %1, %2
+   v_subrev%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+; As subv64si3_vector, but the borrow-out is a real output (operand 5)
+; instead of a VCC clobber.  The borrow is described as
+; (operand 1 - operand 2) >u operand 1, ANDed with the mask (operand 3);
+; the bits outside the mask are taken from operand 6.  Alternatives come
+; in pairs: v_sub or v_subrev, each with operand 5 in "cV" or in "Sg".
+(define_insn "subv64si3_vector_vcc"
+  [(set (match_operand:V64SI 0 "register_operand"    "=  v,   v,   v,   v")
+	(vec_merge:V64SI
+	  (minus:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand" "vSSB,vSSB,   v,   v")
+	    (match_operand:V64SI 2 "gcn_alu_operand" "   v,   v,vSSB,vSSB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand"
+						     "  U0,  U0,  U0,  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand" "   e,   e,   e,   e")))
+   (set (match_operand:DI 5 "register_operand"	     "= cV,  Sg,  cV,  Sg")
+	(ior:DI (and:DI (gtu:DI (minus:V64SI (match_dup 1)
+					     (match_dup 2))
+				(match_dup 1))
+			(match_dup 3))
+		(and:DI (not:DI (match_dup 3))
+			(match_operand:DI 6 "gcn_register_or_unspec_operand"
+						     "  5U,  5U,  5U,  5U"))))]
+  "register_operand (operands[1], VOIDmode)
+   || register_operand (operands[2], VOIDmode)"
+  "@
+   v_sub%^_u32\t%0, %5, %1, %2
+   v_sub%^_u32\t%0, %5, %1, %2
+   v_subrev%^_u32\t%0, %5, %2, %1
+   v_subrev%^_u32\t%0, %5, %2, %1"
+  [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+   (set_attr "length" "8")])
+
+; This pattern does not accept SGPR because VCC read already counts
+; as an SGPR use and number of SGPR operands is limited to 1.
+;
+; V64SI subtract with borrow-in and borrow-out.  The borrow-in value is
+; the vec_merge of operands 7 and 8 under the DImode mask operand 5;
+; operand 6 receives the borrow-out under the mask (operand 3), and
+; operand 9 supplies the bits outside the mask.  Alternatives come in
+; pairs: v_subb or v_subbrev (operands swapped), each with operands 5/6 in
+; "cV" or in "Sg".
+
+(define_insn "subcv64si3_vec"
+  [(set (match_operand:V64SI 0 "register_operand"	    "= v, v, v, v")
+	(vec_merge:V64SI
+	  (minus:V64SI
+	    (minus:V64SI
+	      (vec_merge:V64SI
+		(match_operand:V64SI 7 "gcn_vec1_operand"   "  A, A, A, A")
+		(match_operand:V64SI 8 "gcn_vec0_operand"   "  A, A, A, A")
+		(match_operand:DI 5 "gcn_alu_operand"	    " cV,Sg,cV,Sg"))
+	      (match_operand:V64SI 1 "gcn_alu_operand"	    " vA,vA,vB,vB"))
+	    (match_operand:V64SI 2 "gcn_alu_operand"	    " vB,vB,vA,vA"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand"
+							    " U0,U0,U0,U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"	    "  e, e, e, e")))
+   (set (match_operand:DI 6 "register_operand"		    "=cV,Sg,cV,Sg")
+	(ior:DI (and:DI (ior:DI (gtu:DI (minus:V64SI (minus:V64SI
+						       (vec_merge:V64SI
+							 (match_dup 7)
+							 (match_dup 8)
+							 (match_dup 5))
+						       (match_dup 1))
+						     (match_dup 2))
+					(match_dup 2))
+				(ltu:DI (minus:V64SI (vec_merge:V64SI
+						       (match_dup 7)
+						       (match_dup 8)
+						       (match_dup 5))
+						     (match_dup 1))
+					(match_dup 1)))
+			(match_dup 3))
+		(and:DI (not:DI (match_dup 3))
+			(match_operand:DI 9 "gcn_register_or_unspec_operand"
+							    " 6U,6U,6U,6U"))))]
+  "register_operand (operands[1], VOIDmode)
+   || register_operand (operands[2], VOIDmode)"
+  "@
+   v_subb%^_u32\t%0, %6, %1, %2, %5
+   v_subb%^_u32\t%0, %6, %1, %2, %5
+   v_subbrev%^_u32\t%0, %6, %2, %1, %5
+   v_subbrev%^_u32\t%0, %6, %2, %1, %5"
+  [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+   (set_attr "length" "8")])
+
+; V64DI add.  No assembly is emitted directly ("#"); once every V64DI
+; operand satisfies gcn_can_split_p the insn is split into a part-0 add
+; that writes its carry to VCC and a part-1 add-with-carry that consumes
+; and rewrites VCC.
+(define_insn_and_split "addv64di3_vector"
+  [(set (match_operand:V64DI 0 "register_operand"		  "=  &v")
+	(vec_merge:V64DI
+	  (plus:V64DI
+	    (match_operand:V64DI 1 "register_operand"		  "%  v0")
+	    (match_operand:V64DI 2 "gcn_alu_operand"		  "vSSB0"))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" "   U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "    e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* Part 0: plain add, carry-out to VCC.  */
+    emit_insn (gen_addv64si3_vector_vcc
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 gcn_operand_part (V64DImode, operands[1], 0),
+		 gcn_operand_part (V64DImode, operands[2], 0),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: add with carry; VCC is both carry-in and carry-out.  */
+    emit_insn (gen_addcv64si3_vec
+		(gcn_operand_part (V64DImode, operands[0], 1),
+		 gcn_operand_part (V64DImode, operands[1], 1),
+		 gcn_operand_part (V64DImode, operands[2], 1),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+; V64DI subtract.  No assembly is emitted directly ("#"); once every V64DI
+; operand satisfies gcn_can_split_p the insn is split into a part-0
+; subtract that writes its borrow to VCC and a part-1 subtract-with-borrow
+; that consumes and rewrites VCC.  At least one of operands 1 and 2 must
+; be a register.
+(define_insn_and_split "subv64di3_vector"
+  [(set (match_operand:V64DI 0 "register_operand"	       "=  &v,   &v")
+	(vec_merge:V64DI
+	  (minus:V64DI
+	    (match_operand:V64DI 1 "gcn_alu_operand"	       "vSSB0,   v0")
+	    (match_operand:V64DI 2 "gcn_alu_operand"	       "   v0,vSSB0"))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand"
+							       "   U0,   U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"	       "    e,    e")))
+   (clobber (reg:DI VCC_REG))]
+  "register_operand (operands[1], VOIDmode)
+   || register_operand (operands[2], VOIDmode)"
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* Part 0: plain subtract, borrow-out to VCC.  */
+    emit_insn (gen_subv64si3_vector_vcc
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 gcn_operand_part (V64DImode, operands[1], 0),
+		 gcn_operand_part (V64DImode, operands[2], 0),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: subtract with borrow; VCC is both input and output.  */
+    emit_insn (gen_subcv64si3_vec
+		(gcn_operand_part (V64DImode, operands[0], 1),
+		 gcn_operand_part (V64DImode, operands[1], 1),
+		 gcn_operand_part (V64DImode, operands[2], 1),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8,8")])
+
+; V64DI add of a vector (operand 1) and a broadcast DImode scalar
+; (operand 2).  Split like addv64di3_vector, but using the "_dup" add
+; patterns with the two SImode parts of the scalar.
+(define_insn_and_split "addv64di3_vector_dup"
+  [(set (match_operand:V64DI 0 "register_operand"		  "= &v")
+	(vec_merge:V64DI
+	  (plus:V64DI
+	    (match_operand:V64DI 1 "register_operand"		  "  v0")
+	    (vec_duplicate:V64DI
+	      (match_operand:DI 2 "gcn_alu_operand"		  "SSDB")))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "   e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* Part 0: vector part plus scalar part, carry-out to VCC.  */
+    emit_insn (gen_addv64si3_vector_vcc_dup
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 gcn_operand_part (V64DImode, operands[1], 0),
+		 gcn_operand_part (DImode, operands[2], 0),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: add with carry; VCC is both carry-in and carry-out.  */
+    emit_insn (gen_addcv64si3_vec_dup
+		(gcn_operand_part (V64DImode, operands[0], 1),
+		 gcn_operand_part (V64DImode, operands[1], 1),
+		 gcn_operand_part (DImode, operands[2], 1),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+; V64DI add of a zero-extended V64SI vector (operand 1) and a V64DI vector
+; (operand 2).  Split into a part-0 add of operand 1 and part 0 of
+; operand 2, then a part-1 add-with-carry of part 1 of operand 2 and
+; const0_rtx (the zero-extended operand contributes nothing there).
+(define_insn_and_split "addv64di3_zext"
+  [(set (match_operand:V64DI 0 "register_operand"		  "=&v,&v")
+	(vec_merge:V64DI
+	  (plus:V64DI
+	    (zero_extend:V64DI
+	      (match_operand:V64SI 1 "gcn_alu_operand"		  "0vA,0vB"))
+	    (match_operand:V64DI 2 "gcn_alu_operand"		  "0vB,0vA"))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0, U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e,  e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* Part 0: operand 1 is already V64SI, so it is used whole.  */
+    emit_insn (gen_addv64si3_vector_vcc
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 operands[1],
+		 gcn_operand_part (V64DImode, operands[2], 0),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: only operand 2 and the carry contribute.  */
+    emit_insn (gen_addcv64si3_vec
+		(gcn_operand_part (V64DImode, operands[0], 1),
+		 gcn_operand_part (V64DImode, operands[2], 1),
+		 const0_rtx,
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8,8")])
+
+; V64DI add of a zero-extended, broadcast SImode scalar (operand 1) and a
+; V64DI vector (operand 2).  Split into a part-0 "_dup" add of part 0 of
+; operand 2 and the scalar, then a part-1 add-with-carry of part 1 of
+; operand 2 and const0_rtx (the zero-extended operand contributes nothing
+; there).
+(define_insn_and_split "addv64di3_zext_dup"
+  [(set (match_operand:V64DI 0 "register_operand"		  "=&v")
+	(vec_merge:V64DI
+	  (plus:V64DI
+	    (zero_extend:V64DI
+	      (vec_duplicate:V64SI
+		(match_operand:SI 1 "gcn_alu_operand"		  "BSS")))
+	    (match_operand:V64DI 2 "gcn_alu_operand"		  "vA0"))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* addv64si3_vector_vcc_dup takes the V64SI vector as its operand 1
+       and the SImode scalar to broadcast as its operand 2, so the vector
+       part comes first here.  Operand 1 of this pattern is already
+       SImode and is passed as it is.  */
+    emit_insn (gen_addv64si3_vector_vcc_dup
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 gcn_operand_part (V64DImode, operands[2], 0),
+		 operands[1],
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: only operand 2 and the carry contribute.  */
+    emit_insn (gen_addcv64si3_vec
+		(gcn_operand_part (V64DImode, operands[0], 1),
+		 gcn_operand_part (V64DImode, operands[2], 1),
+		 const0_rtx, operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+; V64DI add of a zero-extended V64SI vector (operand 1) and a broadcast
+; DImode scalar (operand 2).  The split emits three insns: a part-0 "_dup"
+; add with carry-out to VCC, a broadcast of part 1 of the scalar into
+; part 1 of the destination, and an add-with-carry of that part with
+; const0_rtx.
+(define_insn_and_split "addv64di3_zext_dup2"
+  [(set (match_operand:V64DI 0 "register_operand"		       "= v")
+	(vec_merge:V64DI
+	  (plus:V64DI
+	    (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+								       " vA"))
+	    (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSS")))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand"      " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		       "  e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* Part 0: vector operand 1 plus part 0 of the scalar.  */
+    emit_insn (gen_addv64si3_vector_vcc_dup
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 operands[1],
+		 gcn_operand_part (DImode, operands[2], 0),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: broadcast the scalar's part 1 into the destination, then
+       add the carry to it in place.  */
+    rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+    emit_insn (gen_vec_duplicatev64si_exec
+		(dsthi, gcn_operand_part (DImode, operands[2], 1),
+		 operands[3], gcn_gen_undef (V64SImode)));
+    emit_insn (gen_addcv64si3_vec
+		(dsthi, dsthi, const0_rtx, operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+; V64DI add of a sign-extended V64SI vector (operand 1) and a broadcast
+; DImode scalar (operand 2).  Like addv64di3_zext_dup2, except that a
+; V64SI scratch (operand 5) first receives operand 1 arithmetically
+; shifted right by 31, and that scratch replaces const0_rtx in the final
+; add-with-carry.
+(define_insn_and_split "addv64di3_sext_dup2"
+  [(set (match_operand:V64DI 0 "register_operand"		       "= v")
+	(vec_merge:V64DI
+	  (plus:V64DI
+	    (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+								       " vA"))
+	    (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSS")))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand"      " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		       "  e")))
+   (clobber (match_scratch:V64SI 5				       "=&v"))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    /* The scratch is computed from operand 1 before any part of the
+       destination is written.  */
+    emit_insn (gen_ashrv64si3_vector (operands[5], operands[1], GEN_INT (31),
+				      operands[3], gcn_gen_undef (V64SImode)));
+    /* Part 0: vector operand 1 plus part 0 of the scalar.  */
+    emit_insn (gen_addv64si3_vector_vcc_dup
+		(gcn_operand_part (V64DImode, operands[0], 0),
+		 operands[1],
+		 gcn_operand_part (DImode, operands[2], 0),
+		 operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 0),
+		 vcc, gcn_gen_undef (DImode)));
+    /* Part 1: broadcast the scalar's part 1 into the destination, then
+       add the scratch and the carry to it in place.  */
+    rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+    emit_insn (gen_vec_duplicatev64si_exec
+		(dsthi, gcn_operand_part (DImode, operands[2], 1),
+		 operands[3], gcn_gen_undef (V64SImode)));
+    emit_insn (gen_addcv64si3_vec
+		(dsthi, dsthi, operands[5], operands[3],
+		 gcn_operand_part (V64DImode, operands[4], 1),
+		 vcc, vcc, gcn_vec_constant (V64SImode, 1),
+		 gcn_vec_constant (V64SImode, 0),
+		 gcn_gen_undef (DImode)));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+; V64DI add of a vector (operand 1) and a zero-extended, broadcast SImode
+; scalar (operand 2), with no vec_merge; the "exec" attribute is "full".
+; Two instructions are printed: an add of the scalar to the low part with
+; carry-out to vcc, then an add-with-carry of 0 to the high part.  Both
+; write vcc, so VCC is declared clobbered, as in the other add patterns
+; that print vcc explicitly.
+(define_insn "addv64di3_scalarsi"
+  [(set (match_operand:V64DI 0 "register_operand"	       "=&v, v")
+	(plus:V64DI (vec_duplicate:V64DI
+		      (zero_extend:DI
+			(match_operand:SI 2 "register_operand" " Sg,Sg")))
+		    (match_operand:V64DI 1 "register_operand"  "  v, 0")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "v_add%^_u32\t%L0, vcc, %2, %L1\;v_addc%^_u32\t%H0, vcc, 0, %H1, vcc"
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")
+   (set_attr "exec" "full")])
+
+;; }}}
+;; {{{ DS memory ALU: add/sub
+
+;; Modes handled by the DS-memory add/sub patterns in this section:
+;; DS_ARITH_MODE for the *_ds_vector patterns and DS_ARITH_SCALAR_MODE for
+;; the *_ds_scalar patterns.
+(define_mode_iterator DS_ARITH_MODE [V64SI V64SF V64DI])
+(define_mode_iterator DS_ARITH_SCALAR_MODE [SI SF DI])
+
+;; FIXME: the vector patterns probably need RD expanded to a vector of
+;;        addresses.  For now, the only way a vector can get into LDS is
+;;        if the user puts it there manually.
+;;
+;; FIXME: the scalar patterns are probably fine in themselves, but need to be
+;;        checked to see if anything can ever use them.
+
+;; Masked in-place vector add on a DS memory operand.  The insn condition
+;; requires operand 0 and operand 1 to be the same rtx, so register operand 2
+;; is added to the memory location itself.  Operand 3 is the merge mask and
+;; operand 4 the previous value.
+(define_insn "add<mode>3_ds_vector"
+  [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand"	      "=RD")
+	(vec_merge:DS_ARITH_MODE
+	  (plus:DS_ARITH_MODE
+	    (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand"    "%RD")
+	    (match_operand:DS_ARITH_MODE 2 "register_operand"	      "  v"))
+	  (match_operand:DS_ARITH_MODE 4 "gcn_register_ds_or_unspec_operand"
+								      " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		      "  e")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_add%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; Scalar counterpart of the DS in-place add: operand 0 must be the same rtx
+;; as operand 1, and register operand 2 is added to it.  Operand 3 appears
+;; only in a (use ...), not in a vec_merge.
+(define_insn "add<mode>3_ds_scalar"
+  [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand"  "=RD")
+	(plus:DS_ARITH_SCALAR_MODE
+	  (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+								       "%RD")
+	  (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand"     "  v")))
+   (use (match_operand:DI 3 "gcn_exec_operand"			       "  e"))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_add%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; Masked in-place vector subtract on a DS memory operand: memory (operand 1,
+;; which must be the same rtx as operand 0) minus register operand 2.
+;; Operand 3 is the merge mask and operand 4 the previous value.
+(define_insn "sub<mode>3_ds_vector"
+  [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand"	      "=RD")
+	(vec_merge:DS_ARITH_MODE
+	  (minus:DS_ARITH_MODE
+	    (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand"    " RD")
+	    (match_operand:DS_ARITH_MODE 2 "register_operand"	      "  v"))
+	  (match_operand:DS_ARITH_MODE 4 "gcn_register_ds_or_unspec_operand" 
+								      " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		      "  e")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_sub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; Scalar counterpart of the DS in-place subtract: memory (operand 1, which
+;; must be the same rtx as operand 0) minus register operand 2.
+(define_insn "sub<mode>3_ds_scalar"
+  [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand"  "=RD")
+	(minus:DS_ARITH_SCALAR_MODE
+	  (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+								       " RD")
+	  (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand"     "  v")))
+   (use (match_operand:DI 3 "gcn_exec_operand"			       "  e"))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_sub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; Reversed masked in-place vector subtract on a DS memory operand: register
+;; operand 2 minus memory (operand 1, which must be the same rtx as
+;; operand 0), stored back to that memory.  Note that operand 2 is listed
+;; before operand 1 in the RTL.
+(define_insn "subr<mode>3_ds_vector"
+  [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand"	      "=RD")
+	(vec_merge:DS_ARITH_MODE
+	  (minus:DS_ARITH_MODE
+	    (match_operand:DS_ARITH_MODE 2 "register_operand"	      "  v")
+	    (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand"    " RD"))
+	  (match_operand:DS_ARITH_MODE 4 "gcn_register_ds_or_unspec_operand"
+								      " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		      "  e")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_rsub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; Scalar counterpart of the reversed DS subtract: register operand 2 minus
+;; memory (operand 1, which must be the same rtx as operand 0).
+(define_insn "subr<mode>3_ds_scalar"
+  [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand"  "=RD")
+	(minus:DS_ARITH_SCALAR_MODE
+	  (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand"     "  v")
+	  (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand" 
+								       " RD")))
+   (use (match_operand:DI 3 "gcn_exec_operand"			       "  e"))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_rsub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU special case: mult
+
+;; Iterator over both extension codes, with the per-code text fragments used
+;; to build pattern names and assembler templates:
+;;   sgnsuffix  "%i" / "%u"  output-template modifier
+;;   su         "s"  / "u"   pattern-name prefix
+;;   u          ""   / "u"   pattern-name prefix
+;;   iu         "i"  / "u"   mnemonic fragment
+;;   e          "e"  / ""    operand-modifier letter
+;; (sign_extend value first, zero_extend value second).
+(define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
+(define_code_attr su [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u [(sign_extend "") (zero_extend "u")])
+(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
+(define_code_attr e [(sign_extend "e") (zero_extend "")])
+
+;; Expander for the upper 32 bits of a signed or unsigned 32x32->64 multiply.
+;; Operand 3 is filled in from gcn_scalar_exec_reg ().  A CONST_INT second
+;; operand is routed to the const_<su>mulsi3_highpart_scalar pattern;
+;; anything else falls through to the template below.
+(define_expand "<su>mulsi3_highpart"
+  [(parallel
+     [(set (match_operand:SI 0 "register_operand")
+	   (truncate:SI
+	     (lshiftrt:DI
+	       (mult:DI
+		 (any_extend:DI (match_operand:SI 1 "register_operand"))
+		 (any_extend:DI (match_operand:SI 2 "gcn_vop3_operand")))
+	       (const_int 32))))
+      (use (match_dup 3))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec_reg ();
+
+    /* Constant multiplier: use the dedicated pattern and stop here.  */
+    if (CONST_INT_P (operands[2]))
+      {
+	rtx const_mul
+	  = gen_const_<su>mulsi3_highpart_scalar (operands[0], operands[1],
+						  operands[2], operands[3]);
+	emit_insn (const_mul);
+	DONE;
+      }
+  })
+
+;; Masked vector high-part multiply: bits 32..63 of the product of two sign-
+;; or zero-extended V64SI vectors, vec_merged with operand 4 under the mask
+;; in operand 3.  The template lists operand 2 before operand 1.
+(define_insn "<su>mulv64si3_highpart_vector"
+  [(set (match_operand:V64SI 0 "register_operand"		     "=  v")
+	(vec_merge:V64SI
+	  (truncate:V64SI
+	    (lshiftrt:V64DI
+	      (mult:V64DI
+		(any_extend:V64DI
+		  (match_operand:V64SI 1 "gcn_alu_operand"	     "  %v"))
+		(any_extend:V64DI
+		  (match_operand:V64SI 2 "gcn_alu_operand"	     "vSSB")))
+	      (const_int 32)))
+	  (match_operand:V64SI 4 "gcn_register_ds_or_unspec_operand" "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		     "   e")))]
+  ""
+  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Scalar high-part multiply with both inputs in registers: bits 32..63 of
+;; the product of two sign- or zero-extended SImode values.  Operand 3 is
+;; only referenced by a (use ...).
+(define_insn "<su>mulsi3_highpart_scalar"
+  [(set (match_operand:SI 0 "register_operand"	       "= v")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI
+	      (any_extend:DI
+		(match_operand:SI 1 "register_operand" "% v"))
+	      (any_extend:DI
+		(match_operand:SI 2 "register_operand" "vSS")))
+	    (const_int 32))))
+    (use (match_operand:DI 3 "gcn_exec_reg_operand"    "  e"))]
+  ""
+  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; High-part multiply variant in which operand 2 is not wrapped in
+;; any_extend and must satisfy the "A" constraint.  The <su>mulsi3_highpart
+;; expander emits this pattern when its second operand is a CONST_INT.
+(define_insn "const_<su>mulsi3_highpart_scalar"
+  [(set (match_operand:SI 0 "register_operand"	       "=v")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI
+	      (any_extend:DI
+		(match_operand:SI 1 "register_operand" "%v"))
+	      (match_operand:SI 2 "gcn_vop3_operand"   " A"))
+	    (const_int 32))))
+    (use (match_operand:DI 3 "gcn_exec_reg_operand"    " e"))]
+  ""
+  "v_mul_hi<sgnsuffix>0\t%0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Expander for a widening HImode x HImode -> SImode multiply (signed or
+;; unsigned).  It only supplies operand 3 from gcn_scalar_exec_reg ().
+(define_expand "<u>mulhisi3"
+  [(parallel [(set (match_operand:SI 0 "register_operand")
+		   (mult:SI
+		     (any_extend:SI (match_operand:HI 1 "register_operand"))
+		     (any_extend:SI (match_operand:HI 2 "register_operand"))))
+	      (use (match_dup 3))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec_reg ();
+  })
+
+;; Widening HImode x HImode -> SImode multiply.  The template is an SDWA
+;; instruction with WORD_0 selected for both sources.
+(define_insn "<u>mulhisi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"			"=v")
+	(mult:SI
+	  (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
+	  (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))
+   (use (match_operand:DI 3 "gcn_exec_reg_operand"	        " e"))]
+  ""
+  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
+  [(set_attr "type" "vop_sdwa")
+   (set_attr "length" "8")])
+
+;; Expander for a widening QImode x QImode -> HImode multiply (signed or
+;; unsigned).  It only supplies operand 3 from gcn_scalar_exec_reg ().
+(define_expand "<u>mulqihi3"
+  [(parallel [(set (match_operand:HI 0 "register_operand")
+		   (mult:HI
+		     (any_extend:HI (match_operand:QI 1 "register_operand"))
+		     (any_extend:HI (match_operand:QI 2 "register_operand"))))
+	      (use (match_dup 3))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec_reg ();
+  })
+
+;; Widening QImode x QImode -> HImode multiply.  It uses the same SDWA
+;; mnemonic as the HImode variant, but with BYTE_0 selected for both sources.
+(define_insn "<u>mulqihi3_scalar"
+  [(set (match_operand:HI 0 "register_operand"			"=v")
+	(mult:HI
+	  (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
+	  (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))
+   (use (match_operand:DI 3 "gcn_exec_reg_operand"		" e"))]
+  ""
+  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
+  [(set_attr "type" "vop_sdwa")
+   (set_attr "length" "8")])
+
+;; Standard-name expander for V64SI multiply.  The mask (operand 3) comes
+;; from gcn_full_exec_reg () and the previous value (operand 4) from
+;; gcn_gen_undef ().
+(define_expand "mulv64si3"
+  [(set (match_operand:V64SI 0 "register_operand")
+	(vec_merge:V64SI
+	  (mult:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand")
+	    (match_operand:V64SI 2 "gcn_alu_operand"))
+	  (match_dup 4)
+	  (match_dup 3)))]
+  ""
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (V64SImode);
+  })
+
+;; Masked V64SI multiply keeping the low 32 bits of each product.  The result
+;; is vec_merged with the previous value (operand 4) under the mask in
+;; operand 3.
+(define_insn "mulv64si3_vector"
+  [(set (match_operand:V64SI 0 "register_operand"		  "=   v")
+	(vec_merge:V64SI
+	  (mult:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand"		  "%vSvA")
+	    (match_operand:V64SI 2 "gcn_alu_operand"		  " vSvA"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "   U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "    e")))]
+  ""
+  "v_mul_lo_u32\t%0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Masked V64SI multiply of a vector (operand 1) by an SImode scalar
+;; (operand 2) duplicated to every lane.  It emits the same instruction as
+;; mulv64si3_vector.
+(define_insn "mulv64si3_vector_dup"
+  [(set (match_operand:V64SI 0 "register_operand"		  "=   v")
+	(vec_merge:V64SI
+	  (mult:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand"		  "%vSvA")
+	    (vec_duplicate:V64SI
+	      (match_operand:SI 2 "gcn_alu_operand"		  "  SvA")))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "   U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "    e")))]
+  ""
+  "v_mul_lo_u32\t%0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Standard-name expander for V64DI multiply.  It forwards to
+;; mulv64di3_vector with the mask from gcn_full_exec_reg () and an undefined
+;; previous value.
+(define_expand "mulv64di3"
+  [(match_operand:V64DI 0 "register_operand")
+   (match_operand:V64DI 1 "gcn_alu_operand")
+   (match_operand:V64DI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx mul = gen_mulv64di3_vector (operands[0], operands[1], operands[2],
+				    gcn_full_exec_reg (),
+				    gcn_gen_undef (V64DImode));
+    emit_insn (mul);
+    DONE;
+  })
+
+;; Masked V64DI multiply, vec_merged with the previous value (operand 4)
+;; under the mask in operand 3.  It is always output as "#" and split after
+;; reload into 32-bit operations, with operand 5 as a V64SI scratch:
+;;
+;;   out_lo = lo32 (left_lo * right_lo)
+;;   out_hi = hi32 (left_lo * right_lo)
+;;	      + lo32 (left_hi * right_lo)
+;;	      + lo32 (left_lo * right_hi)
+;;
+;; The left_hi * right_hi product is scaled by 2^64, so it cannot affect the
+;; 64-bit result and is deliberately not computed.
+(define_insn_and_split "mulv64di3_vector"
+  [(set (match_operand:V64DI 0 "register_operand"		  "=&v")
+	(vec_merge:V64DI
+	  (mult:V64DI
+	    (match_operand:V64DI 1 "gcn_alu_operand"		  "% v")
+	    (match_operand:V64DI 2 "gcn_alu_operand"		  "vDA"))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e")))
+   (clobber (match_scratch:V64SI 5                                "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0);
+    rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1);
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx exec = operands[3];
+    rtx tmp = operands[5];
+
+    /* Previous value for the first write to each half: undefined when
+       operand 4 is an UNSPEC, otherwise the matching half of operand 4.  */
+    rtx old_lo, old_hi;
+    if (GET_CODE (operands[4]) == UNSPEC)
+      {
+	old_lo = old_hi = gcn_gen_undef (V64SImode);
+      }
+    else
+      {
+	old_lo = gcn_operand_part (V64DImode, operands[4], 0);
+	old_hi = gcn_operand_part (V64DImode, operands[4], 1);
+      }
+
+    rtx undef = gcn_gen_undef (V64SImode);
+
+    emit_insn (gen_mulv64si3_vector (out_lo, left_lo, right_lo, exec, old_lo));
+    emit_insn (gen_umulv64si3_highpart_vector (out_hi, left_lo, right_lo,
+					       exec, old_hi));
+    /* Cross products: only their low 32 bits land in the high result word.
+       The adds pass out_hi as the previous value, so unselected lanes keep
+       what the high-part multiply merged in.  */
+    emit_insn (gen_mulv64si3_vector (tmp, left_hi, right_lo, exec, undef));
+    emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+    emit_insn (gen_mulv64si3_vector (tmp, left_lo, right_hi, exec, undef));
+    emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+    DONE;
+  })
+
+;; Masked V64DI multiply whose first input is a zero-extended V64SI vector,
+;; so it has no high half.  It is split after reload into:
+;;
+;;   out_lo = lo32 (left * right_lo)
+;;   out_hi = hi32 (left * right_lo) + lo32 (left * right_hi)
+;;
+;; Operand 5 is a V64SI scratch for the second product.
+(define_insn_and_split "mulv64di3_vector_zext"
+  [(set (match_operand:V64DI 0 "register_operand"		  "=&v")
+	(vec_merge:V64DI
+	  (mult:V64DI
+	    (zero_extend:V64DI
+	      (match_operand:V64SI 1 "gcn_alu_operand"		  "  v"))
+	    (match_operand:V64DI 2 "gcn_alu_operand"		  "vDA"))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e")))
+   (clobber (match_scratch:V64SI 5                                "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx dst_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx dst_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx src32 = operands[1];
+    rtx src64_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx src64_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx mask = operands[3];
+    rtx scratch = operands[5];
+
+    /* Previous value for each half: one shared undef when operand 4 is an
+       UNSPEC, otherwise the matching half of operand 4.  */
+    bool prev_undefined = GET_CODE (operands[4]) == UNSPEC;
+    rtx prev_lo = (prev_undefined
+		   ? gcn_gen_undef (V64SImode)
+		   : gcn_operand_part (V64DImode, operands[4], 0));
+    rtx prev_hi = (prev_undefined
+		   ? prev_lo
+		   : gcn_operand_part (V64DImode, operands[4], 1));
+
+    rtx scratch_prev = gcn_gen_undef (V64SImode);
+
+    emit_insn (gen_mulv64si3_vector (dst_lo, src32, src64_lo, mask, prev_lo));
+    emit_insn (gen_umulv64si3_highpart_vector (dst_hi, src32, src64_lo,
+					       mask, prev_hi));
+    emit_insn (gen_mulv64si3_vector (scratch, src32, src64_hi, mask,
+				     scratch_prev));
+    emit_insn (gen_addv64si3_vector (dst_hi, dst_hi, scratch, mask, dst_hi));
+    DONE;
+  })
+
+;; Masked V64DI multiply of a zero-extended V64SI vector (operand 1) by a
+;; DImode scalar duplicated to every lane (operand 2).  It is split after
+;; reload into:
+;;
+;;   out_lo = lo32 (left * right_lo)
+;;   out_hi = hi32 (left * right_lo) + lo32 (left * right_hi)
+;;
+;; Operand 5 is a V64SI scratch for the second product.
+(define_insn_and_split "mulv64di3_vector_zext_dup2"
+  [(set (match_operand:V64DI 0 "register_operand"		  "= &v")
+	(vec_merge:V64DI
+	  (mult:V64DI
+	    (zero_extend:V64DI
+	      (match_operand:V64SI 1 "gcn_alu_operand"		  "   v"))
+	    (vec_duplicate:V64DI
+	      (match_operand:DI 2 "gcn_alu_operand"		  "SSDA")))
+	  (match_operand:V64DI 4 "gcn_register_or_unspec_operand" "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "   e")))
+   (clobber (match_scratch:V64SI 5                                "= &v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx dst_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx dst_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx src32 = operands[1];
+    rtx scalar_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx scalar_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx mask = operands[3];
+    rtx scratch = operands[5];
+
+    /* Previous value for each half: one shared undef when operand 4 is an
+       UNSPEC, otherwise the matching half of operand 4.  */
+    bool prev_undefined = GET_CODE (operands[4]) == UNSPEC;
+    rtx prev_lo = (prev_undefined
+		   ? gcn_gen_undef (V64SImode)
+		   : gcn_operand_part (V64DImode, operands[4], 0));
+    rtx prev_hi = (prev_undefined
+		   ? prev_lo
+		   : gcn_operand_part (V64DImode, operands[4], 1));
+
+    rtx scratch_prev = gcn_gen_undef (V64SImode);
+
+    emit_insn (gen_mulv64si3_vector (dst_lo, src32, scalar_lo, mask, prev_lo));
+    emit_insn (gen_umulv64si3_highpart_vector (dst_hi, src32, scalar_lo,
+					       mask, prev_hi));
+    emit_insn (gen_mulv64si3_vector (scratch, src32, scalar_hi, mask,
+				     scratch_prev));
+    emit_insn (gen_addv64si3_vector (dst_hi, dst_hi, scratch, mask, dst_hi));
+    DONE;
+  })
+
+;; }}}
+;; {{{ ALU generic case
+
+;; Iterators for the generic integer ALU patterns: all integer vector modes,
+;; then the operation families (bitwise binary, bitwise unary, shifts, and
+;; signed/unsigned min/max).
+(define_mode_iterator VEC_INT_MODE [V64QI V64HI V64SI V64DI])
+
+(define_code_iterator bitop [and ior xor])
+(define_code_iterator bitunop [not popcount])
+(define_code_iterator shiftop [ashift lshiftrt ashiftrt])
+(define_code_iterator minmaxop [smin smax umin umax])
+
+;; Standard-name expander for the bitwise binary operations on every integer
+;; vector mode.  The mask (operand 3) comes from gcn_full_exec_reg () and the
+;; previous value (operand 4) from gcn_gen_undef ().
+(define_expand "<expander><mode>3"
+  [(set (match_operand:VEC_INT_MODE 0 "gcn_valu_dst_operand")
+	(vec_merge:VEC_INT_MODE
+	  (bitop:VEC_INT_MODE
+	    (match_operand:VEC_INT_MODE 1 "gcn_valu_src0_operand")
+	    (match_operand:VEC_INT_MODE 2 "gcn_valu_src1com_operand"))
+	  (match_dup 4)
+	  (match_dup 3)))]
+  ""
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (<MODE>mode);
+  })
+
+;; Expander for V64SI shifts by a single SImode amount (operand 2), with the
+;; mask from gcn_full_exec_reg () and an undefined previous value.
+(define_expand "<expander>v64si3"
+  [(set (match_operand:V64SI 0 "register_operand")
+	(vec_merge:V64SI
+	  (shiftop:V64SI
+	    (match_operand:V64SI 1 "register_operand")
+	    (match_operand:SI 2 "gcn_alu_operand"))
+	  (match_dup 4)
+	  (match_dup 3)))]
+  ""
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (V64SImode);
+  })
+
+;; Expander for V64SI shifts by a per-lane V64SI amount (operand 2), with the
+;; mask from gcn_full_exec_reg () and an undefined previous value.
+(define_expand "v<expander>v64si3"
+  [(set (match_operand:V64SI 0 "register_operand")
+	(vec_merge:V64SI
+	  (shiftop:V64SI
+	    (match_operand:V64SI 1 "register_operand")
+	    (match_operand:V64SI 2 "gcn_alu_operand"))
+	  (match_dup 4)
+	  (match_dup 3)))]
+  ""
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (V64SImode);
+  })
+
+;; Expander for integer min/max on VEC_1REG_INT_MODE vectors.  The condition
+;; disables it for V64QImode.  The mask comes from gcn_full_exec_reg () and
+;; the previous value is undefined.
+(define_expand "<expander><mode>3"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand")
+	(vec_merge:VEC_1REG_INT_MODE
+	  (minmaxop:VEC_1REG_INT_MODE
+	    (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand")
+	    (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1_operand"))
+	  (match_dup 4)
+	  (match_dup 3)))]
+  "<MODE>mode != V64QImode"
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (<MODE>mode);
+  })
+
+;; Masked vector unary bit operation (not / popcount).  Here the mask is
+;; operand 2 and the previous value is operand 3.
+(define_insn "<expander><mode>2_vector"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand"    "=  v")
+	(vec_merge:VEC_1REG_INT_MODE
+	  (bitunop:VEC_1REG_INT_MODE
+	    (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+								     "vSSB"))
+	  (match_operand:VEC_1REG_INT_MODE 3 "gcn_register_or_unspec_operand"
+								     "  U0")
+	  (match_operand:DI 2 "gcn_exec_reg_operand"		     "   e")))]
+  ""
+  "v_<mnemonic>0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Masked vector bitwise binary operation on single-register integer modes.
+;; Alternative 0 is register-to-register.  Alternative 1 updates a DS memory
+;; destination in place; the insn condition only allows a memory destination
+;; when it is the same rtx as operand 1 and operand 2 is a register.
+(define_insn "<expander><mode>3_vector"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "=  v,RD")
+	(vec_merge:VEC_1REG_INT_MODE
+	  (bitop:VEC_1REG_INT_MODE
+	    (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+								  "%  v, 0")
+	    (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+								  "vSSB, v"))
+	  (match_operand:VEC_1REG_INT_MODE 4
+	    "gcn_register_ds_or_unspec_operand"			  "  U0,U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "   e, e")))]
+  "!memory_operand (operands[0], VOIDmode)
+   || (rtx_equal_p (operands[0], operands[1]) 
+       && register_operand (operands[2], VOIDmode))"
+  "@
+   v_<mnemonic>0\t%0, %2, %1
+   ds_<mnemonic>0\t%A0, %2%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8,8")])
+
+;; Unary bit operation (not / popcount) on a scalar mode, emitted with the
+;; v_ mnemonic form.  Operand 2 is only referenced by a (use ...).
+(define_insn "<expander><mode>2_vscalar"
+  [(set (match_operand:SCALAR_1REG_INT_MODE 0 "gcn_valu_dst_operand"  "=  v")
+	(bitunop:SCALAR_1REG_INT_MODE
+	  (match_operand:SCALAR_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+								      "vSSB")))
+   (use (match_operand:DI 2 "gcn_exec_operand"			      "   e"))]
+  ""
+  "v_<mnemonic>0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Binary operation from the vec_and_scalar_com iterator on a scalar mode.
+;; Alternative 0 is register-to-register.  Alternative 1 updates a DS memory
+;; destination in place, under the same condition as the vector pattern:
+;; the destination must be the same rtx as operand 1 and operand 2 must be a
+;; register.
+(define_insn "<expander><mode>3_scalar"
+  [(set (match_operand:SCALAR_1REG_INT_MODE 0 "gcn_valu_dst_operand"
+								   "=  v,RD")
+	(vec_and_scalar_com:SCALAR_1REG_INT_MODE
+	  (match_operand:SCALAR_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+								   "%  v, 0")
+	  (match_operand:SCALAR_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+								   "vSSB, v")))
+   (use (match_operand:DI 3 "gcn_exec_operand"                     "   e, e"))]
+  "!memory_operand (operands[0], VOIDmode)
+   || (rtx_equal_p (operands[0], operands[1])
+       && register_operand (operands[2], VOIDmode))"
+  "@
+   v_<mnemonic>0\t%0, %2, %1
+   ds_<mnemonic>0\t%A0, %2%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8,8")])
+
+;; Masked V64DI bitwise binary operation.  The register alternative is
+;; output as "#" and split after reload into two independent V64SI
+;; operations, one per half (operands 5-12 are the halves of operands 0, 1,
+;; 2 and 4).  The DS memory alternative is a single instruction and is
+;; excluded from the split by the split condition.
+(define_insn_and_split "<expander>v64di3_vector"
+  [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD")
+	(vec_merge:V64DI
+	  (bitop:V64DI
+	    (match_operand:V64DI 1 "gcn_valu_src0_operand"	  "%  v,RD")
+	    (match_operand:V64DI 2 "gcn_valu_src1com_operand"	  "vSSB, v"))
+	  (match_operand:V64DI 4 "gcn_register_ds_or_unspec_operand"
+								  "  U0,U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "   e, e")))]
+  "!memory_operand (operands[0], VOIDmode)
+   || (rtx_equal_p (operands[0], operands[1])
+       && register_operand (operands[2], VOIDmode))"
+  "@
+   #
+   ds_<mnemonic>0\t%A0, %2%O0"
+  "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))"
+  [(set (match_dup 5)
+	(vec_merge:V64SI
+	  (bitop:V64SI (match_dup 7) (match_dup 9))
+	  (match_dup 11)
+	  (match_dup 3)))
+   (set (match_dup 6)
+	(vec_merge:V64SI
+	  (bitop:V64SI (match_dup 8) (match_dup 10))
+	  (match_dup 12)
+	  (match_dup 3)))]
+  {
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[6] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[8] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[10] = gcn_operand_part (V64DImode, operands[2], 1);
+    operands[11] = gcn_operand_part (V64DImode, operands[4], 0);
+    operands[12] = gcn_operand_part (V64DImode, operands[4], 1);
+  }
+  [(set_attr "type" "vmult,ds")
+   (set_attr "length" "16,8")])
+
+;; DImode bitwise binary operation.  The register alternative is output as
+;; "#" and split after reload into two independent SImode operations, one
+;; per half (operands 4-9 are the halves of operands 0, 1 and 2).  The DS
+;; memory alternative is a single instruction and is excluded from the split
+;; by the split condition.
+;;
+;; The split halves are SImode parts of DImode scalars, so the replacement
+;; operations are bitop:SI.
+(define_insn_and_split "<expander>di3_scalar"
+  [(set (match_operand:DI 0 "gcn_valu_dst_operand"	   "= &v,RD")
+	  (bitop:DI
+	    (match_operand:DI 1 "gcn_valu_src0_operand"	   "%  v,RD")
+	    (match_operand:DI 2 "gcn_valu_src1com_operand" "vSSB, v")))
+   (use (match_operand:DI 3 "gcn_exec_operand"		   "   e, e"))]
+  "!memory_operand (operands[0], VOIDmode)
+   || (rtx_equal_p (operands[0], operands[1])
+       && register_operand (operands[2], VOIDmode))"
+  "@
+   #
+   ds_<mnemonic>0\t%A0, %2%O0"
+  "(reload_completed && !gcn_ds_memory_operand (operands[0], DImode))"
+  [(parallel [(set (match_dup 4)
+		   (bitop:SI (match_dup 6) (match_dup 8)))
+	      (use (match_dup 3))])
+   (parallel [(set (match_dup 5)
+		   (bitop:SI (match_dup 7) (match_dup 9)))
+	      (use (match_dup 3))])]
+  {
+    operands[4] = gcn_operand_part (DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult,ds")
+   (set_attr "length" "16,8")])
+
+;; Masked V64SI shift by a single SImode amount (operand 2).  The template
+;; uses the <revmnemonic> form and lists the shift amount before the value
+;; being shifted.
+(define_insn "<expander>v64si3_vector"
+  [(set (match_operand:V64SI 0 "register_operand"		  "= v")
+	(vec_merge:V64SI
+	  (shiftop:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand"		  "  v")
+	    (match_operand:SI 2 "gcn_alu_operand"		  "SSB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "  e")))]
+  ""
+  "v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+;; Masked V64SI shift by a per-lane V64SI amount (operand 2).  The template
+;; uses the <revmnemonic> form and lists the shift amount before the value
+;; being shifted.
+(define_insn "v<expander>v64si3_vector"
+  [(set (match_operand:V64SI 0 "register_operand"		  "=v")
+	(vec_merge:V64SI
+	  (shiftop:V64SI
+	    (match_operand:V64SI 1 "gcn_alu_operand"		  " v")
+	    (match_operand:V64SI 2 "gcn_alu_operand"		  "vB"))
+	  (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  " e")))]
+  ""
+  "v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+;; Unmasked V64SI shift by an SImode amount.  There is no vec_merge; the
+;; "exec" attribute is set to "full" instead.  Both alternatives emit the
+;; same template and differ only in the constraint on the shift amount
+;; ("Sg" or "I").
+(define_insn "<expander>v64si3_full"
+  [(set (match_operand:V64SI 0 "register_operand"                "=v,v")
+	(shiftop:V64SI (match_operand:V64SI 1 "register_operand" " v,v")
+		       (match_operand:SI 2 "nonmemory_operand"   "Sg,I")))]
+  ""
+  "@
+   v_<revmnemonic>0\t%0, %2, %1
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "4")
+   (set_attr "exec" "full")])
+
+;; SImode shift emitted with the v_ mnemonic form.  Operand 3 is only
+;; referenced by a (use ...).  The leading "*" in the name means no gen_
+;; function is created for this pattern.
+(define_insn "*<expander>si3_scalar"
+  [(set (match_operand:SI 0 "register_operand"  "=  v")
+	(shiftop:SI
+	  (match_operand:SI 1 "gcn_alu_operand" "   v")
+	  (match_operand:SI 2 "gcn_alu_operand" "vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"  "   e"))]
+  ""
+  "v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+;; Masked vector integer min/max, disabled for V64QImode.  Alternative 0 is
+;; register-to-register.  Alternative 1 updates a DS memory destination in
+;; place; the insn condition only allows a memory destination when it is the
+;; same rtx as operand 1 and operand 2 is a register.
+(define_insn "<expander><mode>3_vector"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "=  v,RD")
+	(vec_merge:VEC_1REG_INT_MODE
+	  (minmaxop:VEC_1REG_INT_MODE
+	    (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+								  "%  v, 0")
+	    (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+								  "vSSB, v"))
+	  (match_operand:VEC_1REG_INT_MODE 4
+	    "gcn_register_ds_or_unspec_operand"			  "  U0,U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		  "   e, e")))]
+  "<MODE>mode != V64QImode
+   && (!memory_operand (operands[0], VOIDmode)
+       || (rtx_equal_p (operands[0], operands[1])
+	   && register_operand (operands[2], VOIDmode)))"
+  "@
+   v_<mnemonic>0\t%0, %2, %1
+   ds_<mnemonic>0\t%A0, %2%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - special cases
+
+; GCN does not directly provide a DFmode subtract instruction, so we do it by
+; adding the negated second operand to the first.
+
+;; Masked V64DF subtract, emitted as an add with operand 2 negated in the
+;; template.  The two alternatives differ in which input takes the wider
+;; "vSSB" constraint and in the order the operands appear in the template.
+(define_insn "subv64df3_vector"
+  [(set (match_operand:V64DF 0 "register_operand"		"=  v,   v")
+	(vec_merge:V64DF
+	  (minus:V64DF
+	    (match_operand:V64DF 1 "gcn_alu_operand"	        "vSSB,   v")
+	    (match_operand:V64DF 2 "gcn_alu_operand"		"   v,vSSB"))
+	  (match_operand:V64DF 4 "gcn_register_or_unspec_operand"
+								"  U0,  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		"   e,   e")))]
+  ""
+  "@
+   v_add_f64\t%0, %1, -%2
+   v_add_f64\t%0, -%2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8,8")])
+
+;; Scalar DFmode subtract, emitted as an add with operand 2 negated in the
+;; template.  Operand 3 is only referenced by a (use ...).
+(define_insn "subdf_scalar"
+  [(set (match_operand:DF 0 "register_operand"  "=  v,   v")
+	(minus:DF
+	  (match_operand:DF 1 "gcn_alu_operand" "vSSB,   v")
+	  (match_operand:DF 2 "gcn_alu_operand" "   v,vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"  "   e,   e"))]
+  ""
+  "@
+   v_add_f64\t%0, %1, -%2
+   v_add_f64\t%0, -%2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - generic
+
+;; Floating-point mode iterators.  The *_1REG_* variants leave out the DF
+;; modes.
+(define_mode_iterator VEC_FP_MODE [V64HF V64SF V64DF])
+(define_mode_iterator VEC_FP_1REG_MODE [V64HF V64SF])
+(define_mode_iterator FP_MODE [HF SF DF])
+(define_mode_iterator FP_1REG_MODE [HF SF])
+
+;; Floating-point binary operations: the commutative ones, the
+;; non-commutative one, and the union used by the expanders.
+(define_code_iterator comm_fp [plus mult smin smax])
+(define_code_iterator nocomm_fp [minus])
+(define_code_iterator all_fp [plus mult minus smin smax])
+
+;; Masked vector commutative floating-point operation (plus, mult, smin,
+;; smax) on all FP vector modes.  The template lists operand 2 before
+;; operand 1.
+(define_insn "<expander><mode>3_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"		     "=  v")
+	(vec_merge:VEC_FP_MODE
+	  (comm_fp:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"	     "%  v")
+	    (match_operand:VEC_FP_MODE 2 "gcn_alu_operand"	     "vSSB"))
+	  (match_operand:VEC_FP_MODE 4 "gcn_register_or_unspec_operand"
+								     "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		     "   e")))]
+  ""
+  "v_<mnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+;; Scalar commutative floating-point operation (plus, mult, smin, smax).
+;; Operand 3 is only referenced by a (use ...).
+;; NOTE(review): alternative 1 has an "RL" destination and type "ds", yet
+;; its template uses the v_ mnemonic and never prints operand 2 -- confirm
+;; that this alternative is intended.
+(define_insn "<expander><mode>3_scalar"
+  [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand"    "=  v,  RL")
+	(comm_fp:FP_MODE
+	  (match_operand:FP_MODE 1 "gcn_valu_src0_operand" "%  v,   0")
+	  (match_operand:FP_MODE 2 "gcn_valu_src1_operand" "vSSB,vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"             "   e,   e"))]
+  ""
+  "@
+  v_<mnemonic>0\t%0, %2, %1
+  v_<mnemonic>0\t%0, %1%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8")])
+
+;; Masked vector non-commutative floating-point operation (minus) on the
+;; single-register FP vector modes.  Alternative 0 uses the normal mnemonic
+;; with the operands in source order.  Alternative 1 uses the <revmnemonic>
+;; form with the operands swapped, so either input may take the wider "vSSB"
+;; constraint.
+(define_insn "<expander><mode>3_vector"
+  [(set (match_operand:VEC_FP_1REG_MODE 0 "register_operand"    "=  v,   v")
+	(vec_merge:VEC_FP_1REG_MODE
+	  (nocomm_fp:VEC_FP_1REG_MODE
+	    (match_operand:VEC_FP_1REG_MODE 1 "gcn_alu_operand" "vSSB,   v")
+	    (match_operand:VEC_FP_1REG_MODE 2 "gcn_alu_operand" "   v,vSSB"))
+	  (match_operand:VEC_FP_1REG_MODE 4 "gcn_register_or_unspec_operand"
+								"  U0,  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		"   e,   e")))]
+  ""
+  "@
+   v_<mnemonic>0\t%0, %1, %2
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+;; Scalar non-commutative floating-point operation (minus) on HF/SF.
+;; Alternative 0 uses the normal mnemonic.  Alternative 1 uses the
+;; <revmnemonic> form with the operands swapped.
+(define_insn "<expander><mode>3_scalar"
+  [(set (match_operand:FP_1REG_MODE 0 "register_operand"  "=  v,   v")
+	(nocomm_fp:FP_1REG_MODE
+	  (match_operand:FP_1REG_MODE 1 "gcn_alu_operand" "vSSB,   v")
+	  (match_operand:FP_1REG_MODE 2 "gcn_alu_operand" "   v,vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"		  "   e,   e"))]
+  ""
+  "@
+   v_<mnemonic>0\t%0, %1, %2
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+;; Standard-name expander for every FP binary operation on FP vector modes.
+;; The mask (operand 3) comes from gcn_full_exec_reg () and the previous
+;; value (operand 4) from gcn_gen_undef ().
+(define_expand "<expander><mode>3"
+  [(set (match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+	(vec_merge:VEC_FP_MODE
+	  (all_fp:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand")
+	    (match_operand:VEC_FP_MODE 2 "gcn_valu_src1_operand"))
+	  (match_dup 4)
+	  (match_dup 3)))]
+  ""
+  {
+    operands[3] = gcn_full_exec_reg ();
+    operands[4] = gcn_gen_undef (<MODE>mode);
+  })
+
+;; Standard-name expander for every FP binary operation on scalar FP modes.
+;; Operand 3 is supplied by gcn_scalar_exec ().
+(define_expand "<expander><mode>3"
+  [(parallel [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+		   (all_fp:FP_MODE
+		     (match_operand:FP_MODE 1 "gcn_valu_src0_operand")
+		     (match_operand:FP_MODE 2 "gcn_valu_src1_operand")))
+	      (use (match_dup 3))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+;; }}}
+;; {{{ FP unops
+
+;; Scalar floating-point absolute value, emitted as an add of 0 and operand 1
+;; written as |%1| in the template.  Unlike the other scalar patterns in
+;; this file, this one has no (use ...) of an exec operand.
+(define_insn "abs<mode>2"
+  [(set (match_operand:FP_MODE 0 "register_operand"		 "=v")
+	(abs:FP_MODE (match_operand:FP_MODE 1 "register_operand" " v")))]
+  ""
+  "v_add%i0\t%0, 0, |%1|"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Standard-name expander for vector floating-point absolute value.  It
+;; forwards to abs<mode>2_vector with the mask from gcn_full_exec_reg () and
+;; an undefined previous value.
+(define_expand "abs<mode>2"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand")
+	(abs:VEC_FP_MODE (match_operand:VEC_FP_MODE 1 "register_operand")))]
+  ""
+  {
+    rtx masked_abs = gen_abs<mode>2_vector (operands[0], operands[1],
+					    gcn_full_exec_reg (),
+					    gcn_gen_undef (<MODE>mode));
+    emit_insn (masked_abs);
+    DONE;
+  })
+
+;; Masked vector floating-point absolute value, emitted as an add of 0 and
+;; operand 1 written as |%1| in the template.  The mask is operand 2 and the
+;; previous value is operand 3.
+(define_insn "abs<mode>2_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"		       "=v")
+	(vec_merge:VEC_FP_MODE
+	  (abs:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "register_operand"	       " v"))
+	  (match_operand:VEC_FP_MODE 3 "gcn_register_or_unspec_operand"
+								       "U0")
+	  (match_operand:DI 2 "gcn_exec_reg_operand"		       " e")))]
+  ""
+  "v_add%i0\t%0, 0, |%1|"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Standard-name expander for vector floating-point negation.  It forwards
+;; to neg<mode>2_vector with the mask from gcn_full_exec_reg () and an
+;; undefined previous value.
+(define_expand "neg<mode>2"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand")
+	(neg:VEC_FP_MODE (match_operand:VEC_FP_MODE 1 "register_operand")))]
+  ""
+  {
+    rtx masked_neg = gen_neg<mode>2_vector (operands[0], operands[1],
+					    gcn_full_exec_reg (),
+					    gcn_gen_undef (<MODE>mode));
+    emit_insn (masked_neg);
+    DONE;
+  })
+
+;; Masked vector floating-point negation, emitted as an add of 0 and
+;; operand 1 written as -%1 in the template.  The mask is operand 2 and the
+;; previous value is operand 3.
+(define_insn "neg<mode>2_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"		       "=v")
+	(vec_merge:VEC_FP_MODE
+	  (neg:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "register_operand"	       " v"))
+	  (match_operand:VEC_FP_MODE 3 "gcn_register_or_unspec_operand" 
+								       "U0")
+	  (match_operand:DI 2 "gcn_exec_reg_operand"		       " e")))]
+  ""
+  "v_add%i0\t%0, 0, -%1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Masked vector square root, available only when
+;; flag_unsafe_math_optimizations is set.  The mask is operand 2 and the
+;; previous value is operand 3.
+(define_insn "sqrt<mode>_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"		     "=  v")
+	(vec_merge:VEC_FP_MODE
+	  (sqrt:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"	     "vSSB"))
+	  (match_operand:VEC_FP_MODE 3 "gcn_register_or_unspec_operand"
+								     "  U0")
+	  (match_operand:DI 2 "gcn_exec_reg_operand"		     "   e")))]
+  "flag_unsafe_math_optimizations"
+  "v_sqrt%i0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Scalar square root, available only when flag_unsafe_math_optimizations is
+;; set.  Operand 2 is only referenced by a (use ...).
+(define_insn "sqrt<mode>_scalar"
+  [(set (match_operand:FP_MODE 0 "register_operand"  "=  v")
+	(sqrt:FP_MODE
+	  (match_operand:FP_MODE 1 "gcn_alu_operand" "vSSB")))
+   (use (match_operand:DI 2 "gcn_exec_operand"	     "   e"))]
+  "flag_unsafe_math_optimizations"
+  "v_sqrt%i0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Standard-name expander for vector square root, available only when
+;; flag_unsafe_math_optimizations is set.  The mask (operand 2) comes from
+;; gcn_full_exec_reg () and the previous value (operand 3) from
+;; gcn_gen_undef ().
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand")
+	(vec_merge:VEC_FP_MODE
+	  (sqrt:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"))
+	  (match_dup 3)
+	  (match_dup 2)))]
+  "flag_unsafe_math_optimizations"
+  {
+    operands[2] = gcn_full_exec_reg ();
+    operands[3] = gcn_gen_undef (<MODE>mode);
+  })
+
+;; Standard-name expander for scalar sqrt.  The (use) operand 2 is filled
+;; in from gcn_scalar_exec ().
+(define_expand "sqrt<mode>2"
+  [(parallel [(set (match_operand:FP_MODE 0 "register_operand")
+		   (sqrt:FP_MODE
+		     (match_operand:FP_MODE 1 "gcn_alu_operand")))
+	      (use (match_dup 2))])]
+  "flag_unsafe_math_optimizations"
+  {
+    operands[2] = gcn_scalar_exec ();
+  })
+
+;; }}}
+;; {{{ FP fused multiply and add
+
+;; Masked vector fused multiply-add: (op1 * op2) + op3, vec_merged with
+;; operand 5 under the mask in operand 4.  Operands 1 and 2 are marked
+;; commutative ("%"); the two alternatives differ in whether operand 2 or
+;; operand 3 carries the "vSSA" constraint.
+(define_insn "fma<mode>_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"		"=  v,   v")
+	(vec_merge:VEC_FP_MODE
+	  (fma:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"	"% vA,  vA")
+	    (match_operand:VEC_FP_MODE 2 "gcn_alu_operand"	"  vA,vSSA")
+	    (match_operand:VEC_FP_MODE 3 "gcn_alu_operand"	"vSSA,  vA"))
+	  (match_operand:VEC_FP_MODE 5 "gcn_register_or_unspec_operand"
+								"  U0,  U0")
+	  (match_operand:DI 4 "gcn_exec_reg_operand"		"   e,   e")))]
+  ""
+  "v_fma%i0\t%0, %1, %2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Masked vector fused multiply-add with operand 2 negated:
+;; (op1 * -op2) + op3.  The negation is printed as "-%2" in the template.
+;; There is no commutative marker here, so three alternatives cover the
+;; positions where the "vSSA" constraint may appear.
+(define_insn "fma<mode>_vector_negop2"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"    "=  v,   v,   v")
+	(vec_merge:VEC_FP_MODE
+	  (fma:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "  vA,  vA,vSSA")
+	    (neg:VEC_FP_MODE
+	      (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" 
+							   "  vA,vSSA,  vA"))
+	    (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSSA,  vA,  vA"))
+	  (match_operand:VEC_FP_MODE 5 "gcn_register_or_unspec_operand"
+							   "  U0,  U0,  U0")
+	  (match_operand:DI 4 "gcn_exec_reg_operand"	   "   e,   e,   e")))]
+  ""
+  "v_fma%i0\t%0, %1, -%2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Scalar fused multiply-add: (op1 * op2) + op3, using the same v_fma
+;; template as the vector form.  Operand 4 appears only in a (use).
+(define_insn "fma<mode>_scalar"
+  [(set (match_operand:FP_MODE 0 "register_operand"  "=  v,   v")
+	(fma:FP_MODE
+	  (match_operand:FP_MODE 1 "gcn_alu_operand" "% vA,  vA")
+	  (match_operand:FP_MODE 2 "gcn_alu_operand" "  vA,vSSA")
+	  (match_operand:FP_MODE 3 "gcn_alu_operand" "vSSA,  vA")))
+   (use (match_operand:DI 4 "gcn_exec_operand"	     "   e,   e"))]
+  ""
+  "v_fma%i0\t%0, %1, %2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Scalar fused multiply-add with operand 2 negated: (op1 * -op2) + op3.
+;; The negation is printed as "-%2"; operand 4 appears only in a (use).
+(define_insn "fma<mode>_scalar_negop2"
+  [(set (match_operand:FP_MODE 0 "register_operand"    "=  v,   v,   v")
+	(fma:FP_MODE
+	  (match_operand:FP_MODE 1 "gcn_alu_operand"   "  vA,  vA,vSSA")
+	  (neg:FP_MODE
+	    (match_operand:FP_MODE 2 "gcn_alu_operand" "  vA,vSSA,  vA"))
+	  (match_operand:FP_MODE 3 "gcn_alu_operand"   "vSSA,  vA,  vA")))
+   (use (match_operand:DI 4 "gcn_exec_operand"	       "   e,   e,   e"))]
+  ""
+  "v_fma%i0\t%0, %1, -%2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; Standard-name expander for vector fma.  Operand 4 (the vec_merge mask)
+;; comes from gcn_full_exec_reg () and operand 5 (the merge source) from
+;; gcn_gen_undef ().
+(define_expand "fma<mode>4"
+  [(set (match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+	(vec_merge:VEC_FP_MODE
+	  (fma:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_valu_src1_operand")
+	    (match_operand:VEC_FP_MODE 2 "gcn_valu_src1_operand")
+	    (match_operand:VEC_FP_MODE 3 "gcn_valu_src1_operand"))
+	  (match_dup 5)
+	  (match_dup 4)))]
+  ""
+  {
+    operands[4] = gcn_full_exec_reg ();
+    operands[5] = gcn_gen_undef (<MODE>mode);
+  })
+
+;; Vector fma expander with operand 2 negated: (op1 * -op2) + op3.
+;; Operands 4 and 5 are filled in from gcn_full_exec_reg () and
+;; gcn_gen_undef ().  The vector div<mode>3 expander calls this for its
+;; Newton-Raphson step.
+(define_expand "fma<mode>4_negop2"
+  [(set (match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+	(vec_merge:VEC_FP_MODE
+	  (fma:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_valu_src1_operand")
+	    (neg:VEC_FP_MODE
+	      (match_operand:VEC_FP_MODE 2 "gcn_valu_src1_operand"))
+	    (match_operand:VEC_FP_MODE 3 "gcn_valu_src1_operand"))
+	  (match_dup 5)
+	  (match_dup 4)))]
+  ""
+  {
+    operands[4] = gcn_full_exec_reg ();
+    operands[5] = gcn_gen_undef (<MODE>mode);
+  })
+
+;; Standard-name expander for scalar fma.  The (use) operand 4 is filled in
+;; from gcn_scalar_exec ().
+(define_expand "fma<mode>4"
+  [(parallel [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+		   (fma:FP_MODE
+		     (match_operand:FP_MODE 1 "gcn_valu_src1_operand")
+		     (match_operand:FP_MODE 2 "gcn_valu_src1_operand")
+		     (match_operand:FP_MODE 3 "gcn_valu_src1_operand")))
+	      (use (match_dup 4))])]
+  ""
+  {
+    operands[4] = gcn_scalar_exec ();
+  })
+
+;; Scalar fma expander with operand 2 negated: (op1 * -op2) + op3.
+;; The (use) operand 4 is filled in from gcn_scalar_exec ().  The scalar
+;; div<mode>3 expander calls this for its Newton-Raphson step.
+(define_expand "fma<mode>4_negop2"
+  [(parallel [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+		   (fma:FP_MODE
+		     (match_operand:FP_MODE 1 "gcn_valu_src1_operand")
+		     (neg:FP_MODE
+		       (match_operand:FP_MODE 2 "gcn_valu_src1_operand"))
+		     (match_operand:FP_MODE 3 "gcn_valu_src1_operand")))
+	      (use (match_dup 4))])]
+  ""
+  {
+    operands[4] = gcn_scalar_exec ();
+  })
+
+;; }}}
+;; {{{ FP division
+
+;; Masked vector reciprocal.  Matches a division whose dividend (operand 1)
+;; satisfies gcn_vec1d_operand and emits v_rcp on the divisor (operand 2)
+;; alone; operand 1 never appears in the output template.
+;; NOTE(review): gcn_vec1d_operand presumably accepts only a vector of 1.0;
+;; confirm in predicates.md.
+(define_insn "recip<mode>_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"		     "=  v")
+	(vec_merge:VEC_FP_MODE
+	  (div:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_vec1d_operand"	     "   A")
+	    (match_operand:VEC_FP_MODE 2 "gcn_alu_operand"	     "vSSB"))
+	  (match_operand:VEC_FP_MODE 4 "gcn_register_or_unspec_operand"
+								     "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"		     "   e")))]
+  ""
+  "v_rcp%i0\t%0, %2"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Scalar reciprocal.  Matches a division whose dividend (operand 1)
+;; satisfies gcn_const1d_operand and emits v_rcp on the divisor (operand 2)
+;; alone.  Operand 3 appears only in a (use).
+;; NOTE(review): gcn_const1d_operand presumably accepts only the constant
+;; 1.0; confirm in predicates.md.
+(define_insn "recip<mode>_scalar"
+  [(set (match_operand:FP_MODE 0 "register_operand"	 "=  v")
+	(div:FP_MODE
+	  (match_operand:FP_MODE 1 "gcn_const1d_operand" "   A")
+	  (match_operand:FP_MODE 2 "gcn_alu_operand"	 "vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"		 "   e"))]
+  ""
+  "v_rcp%i0\t%0, %2"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Do division via a = b * 1/c.
+;; The v_rcp_* instructions are not sufficiently accurate on their own,
+;; so we follow them with one round of Newton-Raphson (an fma with the
+;; divisor negated, then a multiply), which the ISA manual says is enough
+;; to improve the reciprocal accuracy.
+;;
+;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
+
+;; Vector FP division, available only when flag_reciprocal_math is set.
+;;   initrcp = rcp (op2)
+;;   fma     = initrcp * -op2 + 2.0
+;;   rcp     = initrcp * fma		; refined 1/op2
+;;   op0     = op1 * rcp
+;; When the dividend is the constant 1.0 the last multiply is skipped and
+;; the refined reciprocal is written straight to operand 0.
+(define_expand "div<mode>3"
+  [(match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+   (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand")
+   (match_operand:VEC_FP_MODE 2 "gcn_valu_src0_operand")]
+  "flag_reciprocal_math"
+  {
+    rtx one = gcn_vec_constant (<MODE>mode,
+		  const_double_from_real_value (dconst1, <SCALAR_MODE>mode));
+    rtx two = gcn_vec_constant (<MODE>mode,
+		  const_double_from_real_value (dconst2, <SCALAR_MODE>mode));
+    rtx initrcp = gen_reg_rtx (<MODE>mode);
+    rtx fma = gen_reg_rtx (<MODE>mode);
+    rtx rcp;
+
+    /* A dividend of 1.0 means only the reciprocal is wanted.  The test must
+       be against 1.0, not -1.0: the sequence below computes +1/op2, so
+       skipping the final multiply for -1.0 would give the wrong sign.
+       NOTE(review): only element 0 of the CONST_VECTOR is inspected.  */
+    bool is_rcp = (GET_CODE (operands[1]) == CONST_VECTOR
+		   && real_identical
+		        (CONST_DOUBLE_REAL_VALUE
+			  (CONST_VECTOR_ELT (operands[1], 0)), &dconst1));
+
+    if (is_rcp)
+      rcp = operands[0];
+    else
+      rcp = gen_reg_rtx (<MODE>mode);
+
+    emit_insn (gen_recip<mode>_vector (initrcp, one, operands[2],
+				       gcn_full_exec_reg (),
+				       gcn_gen_undef (<MODE>mode)));
+    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+    emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+    if (!is_rcp)
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+    DONE;
+  })
+
+;; Scalar FP division, available only when flag_reciprocal_math is set.
+;;   initrcp = rcp (op2)
+;;   fma     = initrcp * -op2 + 2.0
+;;   rcp     = initrcp * fma		; refined 1/op2
+;;   op0     = op1 * rcp
+;; When the dividend is the constant 1.0 the last multiply is skipped and
+;; the refined reciprocal is written straight to operand 0.
+(define_expand "div<mode>3"
+  [(match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+   (match_operand:FP_MODE 1 "gcn_valu_src0_operand")
+   (match_operand:FP_MODE 2 "gcn_valu_src0_operand")]
+  "flag_reciprocal_math"
+  {
+    rtx one = const_double_from_real_value (dconst1, <MODE>mode);
+    rtx two = const_double_from_real_value (dconst2, <MODE>mode);
+    rtx initrcp = gen_reg_rtx (<MODE>mode);
+    rtx fma = gen_reg_rtx (<MODE>mode);
+    rtx rcp;
+
+    /* A dividend of 1.0 means only the reciprocal is wanted.  The test must
+       be against 1.0, not -1.0: the sequence below computes +1/op2, so
+       skipping the final multiply for -1.0 would give the wrong sign.  */
+    bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
+		   && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
+				      &dconst1));
+
+    if (is_rcp)
+      rcp = operands[0];
+    else
+      rcp = gen_reg_rtx (<MODE>mode);
+
+    emit_insn (gen_recip<mode>_scalar (initrcp, one, operands[2],
+				       gcn_scalar_exec ()));
+    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+    emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+    if (!is_rcp)
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+    DONE;
+  })
+
+;; }}}
+;; {{{ Int/FP conversions
+
+;; Iterators for the conversion patterns below.
+;; CVT_* are the scalar modes and VCVT_* the matching V64 vector modes;
+;; the _F_ and _I_ variants restrict to the FP and integer modes.
+(define_mode_iterator CVT_FROM_MODE [HI SI HF SF DF])
+(define_mode_iterator CVT_TO_MODE [HI SI HF SF DF])
+(define_mode_iterator CVT_F_MODE [HF SF DF])
+(define_mode_iterator CVT_I_MODE [HI SI])
+
+(define_mode_iterator VCVT_FROM_MODE [V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_TO_MODE [V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_F_MODE [V64HF V64SF V64DF])
+(define_mode_iterator VCVT_I_MODE [V64HI V64SI])
+
+;; cvt_op iterates over the conversion RTL codes.  cvt_name maps each code
+;; to the prefix of its pattern name, and cvt_operands to the pair of
+;; operand-type print modifiers appended to "v_cvt" in the output templates.
+(define_code_iterator cvt_op [fix unsigned_fix
+			      float unsigned_float
+			      float_extend float_truncate])
+(define_code_attr cvt_name [(fix "fix_trunc") (unsigned_fix "fixuns_trunc")
+			    (float "float") (unsigned_float "floatuns")
+			    (float_extend "extend") (float_truncate "trunc")])
+(define_code_attr cvt_operands [(fix "%i0%i1") (unsigned_fix "%u0%i1")
+				(float "%i0%i1") (unsigned_float "%i0%u1")
+				(float_extend "%i0%i1")
+				(float_truncate "%i0%i1")])
+
+;; Scalar conversions producing an FP mode.  Each code/mode combination is
+;; enabled only if gcn_valid_cvt_p accepts it; the (use) operand 2 comes
+;; from gcn_scalar_exec ().
+(define_expand "<cvt_name><CVT_FROM_MODE:mode><CVT_F_MODE:mode>2"
+  [(parallel [(set (match_operand:CVT_F_MODE 0 "register_operand")
+		   (cvt_op:CVT_F_MODE
+		     (match_operand:CVT_FROM_MODE 1 "gcn_valu_src0_operand")))
+	      (use (match_dup 2))])]
+  "gcn_valid_cvt_p (<CVT_FROM_MODE:MODE>mode, <CVT_F_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_scalar_exec ();
+  })
+
+;; Vector conversions producing an FP vector mode.  Enabled per combination
+;; by gcn_valid_cvt_p.  The mask (operand 2) comes from
+;; gcn_full_exec_reg () and the merge source (operand 3) from
+;; gcn_gen_undef ().
+(define_expand "<cvt_name><VCVT_FROM_MODE:mode><VCVT_F_MODE:mode>2"
+  [(set (match_operand:VCVT_F_MODE 0 "register_operand")
+	(vec_merge:VCVT_F_MODE
+	  (cvt_op:VCVT_F_MODE
+	    (match_operand:VCVT_FROM_MODE 1 "gcn_valu_src0_operand"))
+	  (match_dup 3)
+	  (match_dup 2)))]
+  "gcn_valid_cvt_p (<VCVT_FROM_MODE:MODE>mode, <VCVT_F_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_full_exec_reg ();
+    operands[3] = gcn_gen_undef (<VCVT_F_MODE:MODE>mode);
+  })
+
+;; Scalar conversions from an FP mode to an integer mode.  Enabled per
+;; combination by gcn_valid_cvt_p; the (use) operand 2 comes from
+;; gcn_scalar_exec ().
+(define_expand "<cvt_name><CVT_F_MODE:mode><CVT_I_MODE:mode>2"
+  [(parallel [(set (match_operand:CVT_I_MODE 0 "register_operand")
+		   (cvt_op:CVT_I_MODE
+		     (match_operand:CVT_F_MODE 1 "gcn_valu_src0_operand")))
+	      (use (match_dup 2))])]
+  "gcn_valid_cvt_p (<CVT_F_MODE:MODE>mode, <CVT_I_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_scalar_exec ();
+  })
+
+;; Vector conversions from an FP vector mode to an integer vector mode.
+;; Enabled per combination by gcn_valid_cvt_p.  The mask (operand 2) comes
+;; from gcn_full_exec_reg () and the merge source (operand 3) from
+;; gcn_gen_undef ().
+(define_expand "<cvt_name><VCVT_F_MODE:mode><VCVT_I_MODE:mode>2"
+  [(set (match_operand:VCVT_I_MODE 0 "register_operand")
+	(vec_merge:VCVT_I_MODE
+	  (cvt_op:VCVT_I_MODE
+	    (match_operand:VCVT_F_MODE 1 "gcn_valu_src0_operand"))
+	  (match_dup 3)
+	  (match_dup 2)))]
+  "gcn_valid_cvt_p (<VCVT_F_MODE:MODE>mode, <VCVT_I_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_full_exec_reg ();
+    operands[3] = gcn_gen_undef (<VCVT_I_MODE:MODE>mode);
+  })
+
+;; Scalar conversion instruction.  Iterates over every FROM/TO mode pair and
+;; relies on gcn_valid_cvt_p to reject the unsupported ones.  The mnemonic
+;; is "v_cvt" followed by the <cvt_operands> type suffixes.
+(define_insn "<cvt_name><CVT_FROM_MODE:mode><CVT_TO_MODE:mode>2_insn"
+  [(set (match_operand:CVT_TO_MODE 0 "register_operand"	   "=  v")
+	(cvt_op:CVT_TO_MODE
+	  (match_operand:CVT_FROM_MODE 1 "gcn_alu_operand" "vSSB")))
+   (use (match_operand:DI 2 "gcn_exec_operand"		   "   e"))]
+  "gcn_valid_cvt_p (<CVT_FROM_MODE:MODE>mode, <CVT_TO_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  "v_cvt<cvt_operands>\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Masked vector conversion instruction.  Note the operand numbering: here
+;; operand 2 is the merge source and operand 3 is the mask, matching the
+;; (match_dup 3) / (match_dup 2) positions used by the expanders.
+(define_insn "<cvt_name><VCVT_FROM_MODE:mode><VCVT_TO_MODE:mode>2_insn"
+  [(set (match_operand:VCVT_TO_MODE 0 "register_operand"	    "=  v")
+	(vec_merge:VCVT_TO_MODE
+	  (cvt_op:VCVT_TO_MODE
+	    (match_operand:VCVT_FROM_MODE 1 "gcn_alu_operand"	    "vSSB"))
+	  (match_operand:VCVT_TO_MODE 2 "gcn_alu_or_unspec_operand" "  U0")
+	  (match_operand:DI 3 "gcn_exec_operand"		    "   e")))]
+  "gcn_valid_cvt_p (<VCVT_FROM_MODE:MODE>mode, <VCVT_TO_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  "v_cvt<cvt_operands>\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Int/int conversions
+
+;; GCC can already do these for scalar types, but not for vector types.
+;; Unfortunately you can't just do SUBREG on a vector to select the low part,
+;; so there must be a few tricks here.
+
+;; Masked truncation of V64DI to V64SI.  Output as "#" and split once
+;; reload_completed into a masked V64SI set whose source is part 0 of
+;; operand 1, as returned by gcn_operand_part.
+;; The first alternative ties operand 1 to operand 0 and has length 0; the
+;; second uses an earlyclobber output and has length 4.
+(define_insn_and_split "vec_truncatev64div64si"
+  [(set (match_operand:V64SI 0 "register_operand"	     "=v,&v")
+	(vec_merge:V64SI
+	  (truncate:V64SI
+	    (match_operand:V64DI 1 "register_operand"        " 0, v"))
+	  (match_operand:V64SI 2 "gcn_alu_or_unspec_operand" "U0,U0")
+	  (match_operand:DI 3 "gcn_exec_operand"	     " e, e")))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (vec_merge:V64SI (match_dup 1) (match_dup 2) (match_dup 3)))
+	      (clobber (scratch:V64DI))])]
+  {
+    operands[1] = gcn_operand_part (V64SImode, operands[1], 0);
+  }
+  [(set_attr "type" "vop2")
+   (set_attr "length" "0,4")])
+
+;; }}}
+;; {{{ Vector comparison/merge
+
+;; Vector comparison producing a DImode lane mask.  The comparison result
+;; is ANDed with operand 4, which is filled in from gcn_full_exec_reg ().
+;; A DImode scratch is clobbered.
+(define_expand "vec_cmp<mode>di"
+  [(parallel
+     [(set (match_operand:DI 0 "register_operand")
+	   (and:DI
+	     (match_operator 1 "comparison_operator"
+	       [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+		(match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand")])
+	     (match_dup 4)))
+      (clobber (match_scratch:DI 5))])]
+  ""
+  {
+    operands[4] = gcn_full_exec_reg ();
+  })
+
+;; Unsigned-comparison counterpart of vec_cmp<mode>di, restricted to the
+;; integer vector modes.  The RTL generated is the same shape: comparison
+;; ANDed with operand 4 from gcn_full_exec_reg (), plus a DImode scratch.
+(define_expand "vec_cmpu<mode>di"
+  [(parallel
+     [(set (match_operand:DI 0 "register_operand")
+	   (and:DI
+	     (match_operator 1 "comparison_operator"
+	       [(match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+		(match_operand:VEC_1REG_INT_MODE 3 "gcn_vop3_operand")])
+	     (match_dup 4)))
+      (clobber (match_scratch:DI 5))])]
+  ""
+  {
+    operands[4] = gcn_full_exec_reg ();
+  })
+
+;; Vector comparison instruction.  Six alternatives, in pairs:
+;;   1-2: result constrained to "cV"; printed as v_cmp writing vcc.
+;;   3-4: result constrained to "e"; printed as v_cmpx naming vcc, with the
+;;	  scratch (operand 5) constrained to "cV".
+;;   5-6: result constrained to "Sg"; printed as v_cmp writing %0 (vop3a).
+;; Within the first two pairs the second alternative takes a "B" operand 2
+;; and has length 8 rather than 4.
+(define_insn "vec_cmp<mode>di_insn"
+  [(set (match_operand:DI 0 "register_operand"	       "=cV,cV,  e, e,Sg,Sg")
+	(and:DI
+	  (match_operator 1 "comparison_operator"
+	    [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand"
+						       "vSS, B,vSS, B, v,vA")
+	     (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+						       "  v, v,  v, v,vA, v")])
+	  (match_operand:DI 4 "gcn_exec_reg_operand"   "  e, e,  e, e, e, e")))
+   (clobber (match_scratch:DI 5			       "= X, X, cV,cV, X, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a")
+   (set_attr "length" "4,8,4,8,8,8")])
+
+;; Vector comparison where the first comparand is a scalar (operand 2)
+;; broadcast with vec_duplicate.  The alternatives follow the same scheme as
+;; vec_cmp<mode>di_insn, with a single "Sg" alternative at the end.
+;; The while_ultsidi expander uses the V64SI instance of this pattern.
+(define_insn "vec_cmp<mode>di_dup"
+  [(set (match_operand:DI 0 "register_operand"		    "=cV,cV, e,e,Sg")
+	(and:DI
+	  (match_operator 1 "comparison_operator"
+	    [(vec_duplicate:VEC_1REG_MODE
+	       (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"
+							    " SS, B,SS,B, A"))
+	     (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+							    "  v, v, v,v, v")])
+	  (match_operand:DI 4 "gcn_exec_reg_operand"	    "  e, e, e,e, e")))
+   (clobber (match_scratch:DI 5				    "= X,X,cV,cV, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a")
+   (set_attr "length" "4,8,4,8,8")])
+
+;; Select between two vectors under a DImode mask: operand 0 is the
+;; vec_merge of operand 1 and operand 2 under operand 3.  There are no
+;; preparation statements; the template, including its V64DI scratch
+;; clobber, is emitted as written.
+(define_expand "vcond_mask_<mode>di"
+  [(parallel
+    [(set (match_operand:VEC_REG_MODE 0 "register_operand" "")
+	  (vec_merge:VEC_REG_MODE
+	    (match_operand:VEC_REG_MODE 1 "gcn_vop3_operand" "")
+	    (match_operand:VEC_REG_MODE 2 "gcn_alu_operand" "")
+	    (match_operand:DI 3 "register_operand" "")))
+     (clobber (scratch:V64DI))])]
+  ""
+  "")
+
+;; Vector conditional select: compare operands 4 and 5 with the operator in
+;; operand 3, then merge operands 1 and 2 into operand 0 under the resulting
+;; lane mask.  The data mode (VEC_1REG_MODE) and the comparison mode
+;; (VEC_1REG_ALT) iterate independently.
+(define_expand "vcond<VEC_1REG_MODE:mode><VEC_1REG_ALT:mode>"
+  [(match_operand:VEC_1REG_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])]
+  ""
+  {
+    /* Build the masked comparison by hand: tmp = (op4 <cmp> op5) & exec,
+       with a DImode scratch clobber.  */
+    rtx tmp = gen_reg_rtx (DImode);
+    rtx cmp_op = gen_rtx_fmt_ee (GET_CODE (operands[3]), DImode, operands[4],
+				 operands[5]);
+    rtx set = gen_rtx_SET (tmp, gen_rtx_AND (DImode, cmp_op,
+					     gcn_full_exec_reg ()));
+    rtx clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (DImode));
+    emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+
+    /* The select works on the data mode, so name that iterator explicitly;
+       a bare <mode> is ambiguous when two mode iterators are in use.  */
+    emit_insn (gen_vcond_mask_<VEC_1REG_MODE:mode>di (operands[0],
+						       operands[1],
+						       operands[2], tmp));
+    DONE;
+  })
+
+
+;; Unsigned-comparison vector conditional select for the integer vector
+;; modes: compare operands 4 and 5 with the operator in operand 3, then
+;; merge operands 1 and 2 into operand 0 under the resulting lane mask.
+(define_expand "vcondu<VEC_1REG_INT_MODE:mode><VEC_1REG_INT_ALT:mode>"
+  [(match_operand:VEC_1REG_INT_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])]
+  ""
+  {
+    /* Build the masked comparison by hand: tmp = (op4 <cmp> op5) & exec,
+       with a DImode scratch clobber.  */
+    rtx tmp = gen_reg_rtx (DImode);
+    rtx cmp_op = gen_rtx_fmt_ee (GET_CODE (operands[3]), DImode, operands[4],
+				 operands[5]);
+    rtx set = gen_rtx_SET (tmp,
+			   gen_rtx_AND (DImode, cmp_op, gcn_full_exec_reg ()));
+    rtx clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (DImode));
+    emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+
+    /* The select works on the data mode, so name that iterator explicitly;
+       a bare <mode> is ambiguous when two mode iterators are in use.  */
+    emit_insn (gen_vcond_mask_<VEC_1REG_INT_MODE:mode>di (operands[0],
+							   operands[1],
+							   operands[2], tmp));
+    DONE;
+  })
+
+;; }}}
+;; {{{ Fully masked loop support
+
+;; Build the DImode lane mask for a fully-masked loop: lane i is enabled
+;; while (operand 1 + i) is below operand 2.
+;; If either bound is not a CONST_INT the mask is computed at run time by
+;; comparing against VGPR 1 (plus operand 1 when that is not known to be
+;; zero).  If both are constants the mask is computed here.
+(define_expand "while_ultsidi"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:SI 1 "")
+   (match_operand:SI 2 "")]
+  ""
+  {
+    if (GET_CODE (operands[1]) != CONST_INT
+	|| GET_CODE (operands[2]) != CONST_INT)
+      {
+	rtx exec = gcn_full_exec_reg ();
+	/* NOTE(review): VGPR 1 is presumably preloaded with the lane
+	   numbers 0, 1, 2, ...; confirm against the prologue code.  */
+	rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+	rtx tmp = _0_1_2_3;
+	if (GET_CODE (operands[1]) != CONST_INT
+	    || INTVAL (operands[1]) != 0)
+	  {
+	    tmp = gen_reg_rtx (V64SImode);
+	    emit_insn (gen_addv64si3_vector_dup (tmp, _0_1_2_3, operands[1],
+						 exec, tmp));
+	  }
+	emit_insn (gen_vec_cmpv64sidi_dup (operands[0],
+					   gen_rtx_GT (VOIDmode, 0, 0),
+					   operands[2], tmp, exec));
+      }
+    else
+      {
+	/* Treat both bounds as unsigned SImode values, as "ult" requires.
+	   An empty range (start >= end) gives an empty mask; this also
+	   avoids shifting by a negative count, which is undefined.  */
+	unsigned HOST_WIDE_INT start
+	  = UINTVAL (operands[1]) & GET_MODE_MASK (SImode);
+	unsigned HOST_WIDE_INT end
+	  = UINTVAL (operands[2]) & GET_MODE_MASK (SImode);
+	unsigned HOST_WIDE_INT mask;
+
+	if (start >= end)
+	  mask = 0;
+	else if (end - start >= 64)
+	  mask = (unsigned HOST_WIDE_INT)-1;
+	else
+	  mask = ~((unsigned HOST_WIDE_INT)-1 << (end - start));
+
+	emit_move_insn (operands[0],
+			gen_rtx_CONST_INT (VOIDmode, (HOST_WIDE_INT) mask));
+      }
+    DONE;
+  })
+
+;; Masked vector load.  The memory operand is turned into a vector of
+;; addresses by gcn_expand_scalar_to_vector_address and the load is then
+;; emitted as a gather under the mask in operand 2, with an undefined
+;; merge value.
+(define_expand "maskload<mode>di"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:VEC_REG_MODE 1 "memory_operand")
+   (match_operand 2 "")]
+  ""
+  {
+    rtx mem = operands[1];
+    rtx mask = force_reg (DImode, operands[2]);
+    rtx scratch = gen_rtx_SCRATCH (V64DImode);
+    rtx vaddr = gcn_expand_scalar_to_vector_address (<MODE>mode, mask, mem,
+						     scratch);
+    /* The address space and volatility of the MEM travel as constants.  */
+    rtx addr_space = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (mem));
+    rtx is_volatile = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (mem));
+    rtx merge = gcn_gen_undef (<MODE>mode);
+
+    emit_insn (gen_gather<mode>_expr (operands[0], vaddr, addr_space,
+				      is_volatile, merge, mask));
+    DONE;
+  })
+
+;; Masked vector store.  The memory operand is turned into a vector of
+;; addresses by gcn_expand_scalar_to_vector_address and the store is then
+;; emitted as a scatter under the mask in operand 2.
+(define_expand "maskstore<mode>di"
+  [(match_operand:VEC_REG_MODE 0 "memory_operand")
+   (match_operand:VEC_REG_MODE 1 "register_operand")
+   (match_operand 2 "")]
+  ""
+  {
+    rtx mem = operands[0];
+    rtx mask = force_reg (DImode, operands[2]);
+    rtx scratch = gen_rtx_SCRATCH (V64DImode);
+    rtx vaddr = gcn_expand_scalar_to_vector_address (<MODE>mode, mask, mem,
+						     scratch);
+    /* The address space and volatility of the MEM travel as constants.  */
+    rtx addr_space = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (mem));
+    rtx is_volatile = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (mem));
+
+    emit_insn (gen_scatter<mode>_expr (vaddr, operands[1], addr_space,
+				       is_volatile, mask));
+    DONE;
+  })
+
+;; Masked gather load.  Operand 5 is the mask.  A V64DI operand 2 is first
+;; truncated to V64SI; everything is then passed on to gather<mode>_exec.
+(define_expand "mask_gather_load<mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")
+   (match_operand:DI 5 "")]
+  ""
+  {
+    rtx mask = force_reg (DImode, operands[5]);
+    rtx offsets = operands[2];
+
+    /* TODO: more conversions will be needed when more types are vectorized. */
+    if (GET_MODE (offsets) == V64DImode)
+      {
+	rtx narrow = gen_reg_rtx (V64SImode);
+	rtx undef = gcn_gen_undef (V64SImode);
+	emit_insn (gen_vec_truncatev64div64si (narrow, offsets, undef, mask));
+	offsets = narrow;
+      }
+
+    emit_insn (gen_gather<mode>_exec (operands[0], operands[1], offsets,
+				      operands[3], operands[4], mask));
+    DONE;
+  })
+
+;; Masked scatter store.  Operand 5 is the mask.  A V64DI operand 1 is first
+;; truncated to V64SI; everything is then passed on to scatter<mode>_exec.
+(define_expand "mask_scatter_store<mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")
+   (match_operand:DI 5 "")]
+  ""
+  {
+    rtx mask = force_reg (DImode, operands[5]);
+    rtx offsets = operands[1];
+
+    /* TODO: more conversions will be needed when more types are vectorized. */
+    if (GET_MODE (offsets) == V64DImode)
+      {
+	rtx narrow = gen_reg_rtx (V64SImode);
+	rtx undef = gcn_gen_undef (V64SImode);
+	emit_insn (gen_vec_truncatev64div64si (narrow, offsets, undef, mask));
+	offsets = narrow;
+      }
+
+    emit_insn (gen_scatter<mode>_exec (operands[0], offsets, operands[2],
+				       operands[3], operands[4], mask));
+    DONE;
+  })
+
+; Modes and codes for the conditional (masked) arithmetic expanders below.
+; COND_INT_MODE is the integer subset of COND_MODE.
+; FIXME this should be VEC_REG_MODE, but not all dependencies are implemented.
+(define_mode_iterator COND_MODE [V64SI V64DI V64SF V64DF])
+(define_mode_iterator COND_INT_MODE [V64SI V64DI])
+
+(define_code_iterator cond_op [plus minus])
+
+;; Conditional add/subtract.  Operand 1 is the DImode mask and operand 4 is
+;; passed as the last argument of the masked <expander><mode>3_vector
+;; pattern.  The mask and the first source are forced into registers.
+(define_expand "cond_<expander><mode>"
+  [(match_operand:COND_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (cond_op:COND_MODE
+     (match_operand:COND_MODE 2 "gcn_alu_operand")
+     (match_operand:COND_MODE 3 "gcn_alu_operand"))
+   (match_operand:COND_MODE 4 "register_operand")]
+  ""
+  {
+    rtx mask = force_reg (DImode, operands[1]);
+    rtx src1 = force_reg (<MODE>mode, operands[2]);
+
+    emit_insn (gen_<expander><mode>3_vector (operands[0], src1, operands[3],
+					     mask, operands[4]));
+    DONE;
+  })
+
+; Bitwise codes handled by the integer-only conditional expander.
+(define_code_iterator cond_bitop [and ior xor])
+
+;; Conditional and/ior/xor for the integer vector modes.  Operand 1 is the
+;; DImode mask and operand 4 is passed as the last argument of the masked
+;; <expander><mode>3_vector pattern.  The mask and the first source are
+;; forced into registers.
+(define_expand "cond_<expander><mode>"
+  [(match_operand:COND_INT_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (cond_bitop:COND_INT_MODE
+     (match_operand:COND_INT_MODE 2 "gcn_alu_operand")
+     (match_operand:COND_INT_MODE 3 "gcn_alu_operand"))
+   (match_operand:COND_INT_MODE 4 "register_operand")]
+  ""
+  {
+    rtx mask = force_reg (DImode, operands[1]);
+    rtx src1 = force_reg (<MODE>mode, operands[2]);
+
+    emit_insn (gen_<expander><mode>3_vector (operands[0], src1, operands[3],
+					     mask, operands[4]));
+    DONE;
+  })
+
+;; }}}
+;; {{{ Vector reductions
+
+; All reduction operations available for single-register vector modes.
+(define_int_iterator REDUC_UNSPEC [UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+				   UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+				   UNSPEC_PLUS_DPP_SHR
+				   UNSPEC_AND_DPP_SHR
+				   UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+; The subset used by the V64DI patterns (no min/max).
+(define_int_iterator REDUC_2REG_UNSPEC [UNSPEC_PLUS_DPP_SHR
+					UNSPEC_AND_DPP_SHR
+					UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+; Identity mapping, so that the unspec's own name can be substituted into
+; C code as <reduc_unspec>.
+; FIXME: Isn't there a better way of doing this?
+(define_int_attr reduc_unspec [(UNSPEC_SMIN_DPP_SHR "UNSPEC_SMIN_DPP_SHR")
+			       (UNSPEC_SMAX_DPP_SHR "UNSPEC_SMAX_DPP_SHR")
+			       (UNSPEC_UMIN_DPP_SHR "UNSPEC_UMIN_DPP_SHR")
+			       (UNSPEC_UMAX_DPP_SHR "UNSPEC_UMAX_DPP_SHR")
+			       (UNSPEC_PLUS_DPP_SHR "UNSPEC_PLUS_DPP_SHR")
+			       (UNSPEC_AND_DPP_SHR "UNSPEC_AND_DPP_SHR")
+			       (UNSPEC_IOR_DPP_SHR "UNSPEC_IOR_DPP_SHR")
+			       (UNSPEC_XOR_DPP_SHR "UNSPEC_XOR_DPP_SHR")])
+
+; Operation name as it appears in the pattern names.
+(define_int_attr reduc_op [(UNSPEC_SMIN_DPP_SHR "smin")
+			   (UNSPEC_SMAX_DPP_SHR "smax")
+			   (UNSPEC_UMIN_DPP_SHR "umin")
+			   (UNSPEC_UMAX_DPP_SHR "umax")
+			   (UNSPEC_PLUS_DPP_SHR "plus")
+			   (UNSPEC_AND_DPP_SHR "and")
+			   (UNSPEC_IOR_DPP_SHR "ior")
+			   (UNSPEC_XOR_DPP_SHR "xor")])
+
+; Instruction mnemonic, including the operand-type print modifier, that is
+; handed to gcn_expand_dpp_shr_insn.
+(define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
+			     (UNSPEC_SMAX_DPP_SHR "v_max%i0")
+			     (UNSPEC_UMIN_DPP_SHR "v_min%u0")
+			     (UNSPEC_UMAX_DPP_SHR "v_max%u0")
+			     (UNSPEC_PLUS_DPP_SHR "v_add%u0")
+			     (UNSPEC_AND_DPP_SHR  "v_and%b0")
+			     (UNSPEC_IOR_DPP_SHR  "v_or%b0")
+			     (UNSPEC_XOR_DPP_SHR  "v_xor%b0")])
+
+;; Reduce a single-register vector to a scalar.  gcn_expand_reduc_scalar
+;; does the reduction, and the scalar is then copied out of lane 63 of its
+;; result with mov_from_lane63_<mode>.
+(define_expand "reduc_<reduc_op>_scal_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
+	(unspec:<SCALAR_MODE>
+	  [(match_operand:VEC_1REG_MODE 1 "register_operand")]
+	  REDUC_UNSPEC))]
+  ""
+  {
+    /* The result of the reduction is in lane 63 of the returned vector.  */
+    rtx reduced = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
+					   <reduc_unspec>);
+
+    emit_insn (gen_mov_from_lane63_<mode> (operands[0], reduced));
+    DONE;
+  })
+
+;; Reduce a V64DI vector to a DImode scalar, for the operations in
+;; REDUC_2REG_UNSPEC.  gcn_expand_reduc_scalar does the reduction, and the
+;; scalar is then copied out of lane 63 with mov_from_lane63_v64di.
+(define_expand "reduc_<reduc_op>_scal_v64di"
+  [(set (match_operand:DI 0 "register_operand")
+	(unspec:DI
+	  [(match_operand:V64DI 1 "register_operand")]
+	  REDUC_2REG_UNSPEC))]
+  ""
+  {
+    /* The result of the reduction is in lane 63 of the returned vector.  */
+    rtx reduced = gcn_expand_reduc_scalar (V64DImode, operands[1],
+					   <reduc_unspec>);
+
+    emit_insn (gen_mov_from_lane63_v64di (operands[0], reduced));
+    DONE;
+  })
+
+;; One reduction step on a single-register vector.  Operand 3 is a constant
+;; that is passed, with the mnemonic and the unspec number, to
+;; gcn_expand_dpp_shr_insn, which returns the assembly text.
+;; The condition excludes integer PLUS on TARGET_GCN3; see the carry
+;; variants under "Special cases for addition".
+(define_insn "*<reduc_op>_dpp_shr_<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"   "=v")
+	(unspec:VEC_1REG_MODE
+	  [(match_operand:VEC_1REG_MODE 1 "register_operand" "v")
+	   (match_operand:VEC_1REG_MODE 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"	     "n")]
+	  REDUC_UNSPEC))]
+  "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
+     && <reduc_unspec> == UNSPEC_PLUS_DPP_SHR)"
+  {
+    return gcn_expand_dpp_shr_insn (<MODE>mode, "<reduc_insn>",
+				    <reduc_unspec>, INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+;; One reduction step on a V64DI vector.  Output as "#" and split once
+;; reload_completed into two V64SI steps, one on part 0 and one on part 1
+;; of each operand (parts obtained with gcn_operand_part).  The output is
+;; earlyclobber.
+(define_insn_and_split "*<reduc_op>_dpp_shr_v64di"
+  [(set (match_operand:V64DI 0 "register_operand"   "=&v")
+	(unspec:V64DI
+	  [(match_operand:V64DI 1 "register_operand" "v0")
+	   (match_operand:V64DI 2 "register_operand" "v0")
+	   (match_operand:SI 3 "const_int_operand"    "n")]
+	  REDUC_2REG_UNSPEC))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4)
+	(unspec:V64SI
+	  [(match_dup 6) (match_dup 8) (match_dup 3)] REDUC_2REG_UNSPEC))
+   (set (match_dup 5)
+	(unspec:V64SI
+	  [(match_dup 7) (match_dup 9) (match_dup 3)] REDUC_2REG_UNSPEC))]
+  {
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "exec" "full")
+   (set_attr "length" "16")])
+
+; Special cases for addition.
+
+; Integer add reduction step that clobbers VCC.  The mnemonic is
+; "v_add%u0" on TARGET_GCN3 and "v_add_co%u0" otherwise; the assembly text
+; is produced by gcn_expand_dpp_shr_insn.
+(define_insn "*plus_carry_dpp_shr_<mode>"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "register_operand"   "=v")
+	(unspec:VEC_1REG_INT_MODE
+	  [(match_operand:VEC_1REG_INT_MODE 1 "register_operand" "v")
+	   (match_operand:VEC_1REG_INT_MODE 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"		 "n")]
+	  UNSPEC_PLUS_CARRY_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  {
+    const char *insn = TARGET_GCN3 ? "v_add%u0" : "v_add_co%u0";
+    return gcn_expand_dpp_shr_insn (<MODE>mode, insn,
+				    UNSPEC_PLUS_CARRY_DPP_SHR,
+				    INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+; V64SI add reduction step with an extra DImode input (operand 4,
+; constraint "cV") that also clobbers VCC.  The mnemonic is "v_addc%u0" on
+; TARGET_GCN3 and "v_addc_co%u0" otherwise.  The V64DI split below uses
+; this for the second half of the addition.
+(define_insn "*plus_carry_in_dpp_shr_v64si"
+  [(set (match_operand:V64SI 0 "register_operand"   "=v")
+	(unspec:V64SI
+	  [(match_operand:V64SI 1 "register_operand" "v")
+	   (match_operand:V64SI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"   "n")
+	   (match_operand:DI 4 "register_operand"   "cV")]
+	  UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  {
+    const char *insn = TARGET_GCN3 ? "v_addc%u0" : "v_addc_co%u0";
+    return gcn_expand_dpp_shr_insn (V64SImode, insn,
+				    UNSPEC_PLUS_CARRY_IN_DPP_SHR,
+				    INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+; V64DI add reduction step.  Output as "#" and split once reload_completed
+; into a carry-producing V64SI add on part 0 followed by a carry-consuming
+; V64SI add on part 1 that reads VCC.  Both halves clobber VCC.
+(define_insn_and_split "*plus_carry_dpp_shr_v64di"
+  [(set (match_operand:V64DI 0 "register_operand"   "=&v")
+	(unspec:V64DI
+	  [(match_operand:V64DI 1 "register_operand" "v0")
+	   (match_operand:V64DI 2 "register_operand" "v0")
+	   (match_operand:SI 3 "const_int_operand"    "n")]
+	  UNSPEC_PLUS_CARRY_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 4)
+		(unspec:V64SI
+		  [(match_dup 6) (match_dup 8) (match_dup 3)]
+		  UNSPEC_PLUS_CARRY_DPP_SHR))
+	      (clobber (reg:DI VCC_REG))])
+   (parallel [(set (match_dup 5)
+		(unspec:V64SI
+		  [(match_dup 7) (match_dup 9) (match_dup 3) (reg:DI VCC_REG)]
+		  UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+	      (clobber (reg:DI VCC_REG))])]
+  {
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "exec" "full")
+   (set_attr "length" "16")])
+
+; Instructions to move a scalar value from lane 63 of a vector register.
+; Copy the lane-63 element of a single-register vector into a scalar.
+; Alternative 1 ("Sg" destination) uses v_readlane_b32 with lane 63.
+; Alternative 2 ("v" destination) uses a v_mov_b32 with wave_ror:1.
+(define_insn "mov_from_lane63_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"  "=Sg,v")
+	(unspec:<SCALAR_MODE>
+	  [(match_operand:VEC_1REG_MODE 1 "register_operand" "v,v")]
+	  UNSPEC_MOV_FROM_LANE63))]
+  ""
+  "@
+   v_readlane_b32\t%0, %1, 63
+   v_mov_b32\t%0, %1 wave_ror:1"
+  [(set_attr "type" "vop3a,vop_dpp")
+   (set_attr "exec" "*,full")
+   (set_attr "length" "8")])
+
+; Copy the lane-63 element of a V64DI vector into a DImode scalar.
+; Alternative 1 ("Sg" destination) uses two v_readlane_b32, one per half.
+; Alternative 2 ("v" destination) uses two v_mov_b32 with wave_ror:1; the
+; order of the halves depends on the register numbers so that an
+; overlapping source half is read before it is overwritten.
+; Each alternative prints two instructions, so the length is 16, as for
+; the other two-instruction V64DI patterns in this file.
+(define_insn "mov_from_lane63_v64di"
+  [(set (match_operand:DI 0 "register_operand"	     "=Sg,v")
+	(unspec:DI
+	  [(match_operand:V64DI 1 "register_operand"   "v,v")]
+	  UNSPEC_MOV_FROM_LANE63))]
+  ""
+  "@
+   v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
+   * if (REGNO (operands[0]) <= REGNO (operands[1]))	\
+       return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\"	\
+	      \"v_mov_b32\t%H0, %H1 wave_ror:1\";	\
+     else						\
+       return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\"	\
+	      \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
+  [(set_attr "type" "vop3a,vop_dpp")
+   (set_attr "exec" "*,full")
+   (set_attr "length" "16")])
+
+;; }}}
+;; {{{ Miscellaneous
+
+;; Build the V64SI series operand 1 + i * operand 2: multiply VGPR 1 by the
+;; step, then add the base.
+;; NOTE(review): this relies on VGPR 1 holding the lane numbers (the
+;; while_ultsidi expander calls that register _0_1_2_3); confirm.
+(define_expand "vec_seriesv64si"
+  [(match_operand:V64SI 0 "register_operand")
+   (match_operand:SI 1 "gcn_alu_operand")
+   (match_operand:SI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx scaled = gen_reg_rtx (V64SImode);
+    rtx lane_ids = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+    rtx merge = gcn_gen_undef (V64SImode);
+    rtx mask = gcn_full_exec_reg ();
+
+    /* scaled = lane_ids * step  */
+    emit_insn (gen_mulv64si3_vector_dup (scaled, lane_ids, operands[2], mask,
+					 merge));
+    /* result = scaled + base  */
+    emit_insn (gen_addv64si3_vector_dup (operands[0], scaled, operands[1],
+					 mask, merge));
+    DONE;
+  })
+
+;; Build the V64DI series operand 1 + i * operand 2.  VGPR 1 is read as
+;; V64SI and multiplied by the DImode step with the zext_dup2 multiply
+;; pattern, then the base is added.
+;; NOTE(review): this relies on VGPR 1 holding the lane numbers (the
+;; while_ultsidi expander calls that register _0_1_2_3); confirm.
+(define_expand "vec_seriesv64di"
+  [(match_operand:V64DI 0 "register_operand")
+   (match_operand:DI 1 "gcn_alu_operand")
+   (match_operand:DI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx scaled = gen_reg_rtx (V64DImode);
+    rtx lane_ids = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+    rtx merge = gcn_gen_undef (V64DImode);
+    rtx mask = gcn_full_exec_reg ();
+
+    /* scaled = lane_ids * step  */
+    emit_insn (gen_mulv64di3_vector_zext_dup2 (scaled, lane_ids, operands[2],
+					       mask, merge));
+    /* result = scaled + base  */
+    emit_insn (gen_addv64di3_vector_dup (operands[0], scaled, operands[1],
+					 mask, merge));
+    DONE;
+  })
+
+;; }}}
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
new file mode 100644
index 0000000..5886e0b
--- /dev/null
+++ b/gcc/config/gcn/gcn.md
@@ -0,0 +1,2152 @@ 
+;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;- See file "rtl.def" for documentation on define_insn, match_*, et al.
+
+(include "predicates.md")
+(include "constraints.md")
+
+;; {{{ Constants and enums
+
+; Named registers.  These are the register numbers the patterns refer to by
+; name: SGPRs occupy 0-101, VGPRs occupy 160-415, and the special scalar
+; registers sit in between.  Each _LO/_HI pair names two consecutive
+; registers, and the unsuffixed name has the same number as its _LO half.
+(define_constants
+  [(FIRST_SGPR_REG		 0)
+   (LAST_SGPR_REG		 101)
+   (FLAT_SCRATCH_REG		 102)
+   (FLAT_SCRATCH_LO_REG		 102)
+   (FLAT_SCRATCH_HI_REG		 103)
+   (XNACK_MASK_REG		 104)
+   (XNACK_MASK_LO_REG		 104)
+   (XNACK_MASK_HI_REG		 105)
+   (VCC_REG			 106)
+   (VCC_LO_REG			 106)
+   (VCC_HI_REG			 107)
+   (VCCZ_REG			 108)
+   (TBA_REG			 109)
+   (TBA_LO_REG			 109)
+   (TBA_HI_REG			 110)
+   (TMA_REG			 111)
+   (TMA_LO_REG			 111)
+   (TMA_HI_REG			 112)
+   (TTMP0_REG			 113)
+   (TTMP11_REG			 124)
+   (M0_REG			 125)
+   (EXEC_REG			 126)
+   (EXEC_LO_REG			 126)
+   (EXEC_HI_REG			 127)
+   (EXECZ_REG			 128)
+   (SCC_REG			 129)
+   (FIRST_VGPR_REG		 160)
+   (LAST_VGPR_REG		 415)])
+
+; Register numbers with a fixed role.  LR_REGNUM (18) corresponds to the
+; s[18:19] pair written by s_swappc_b64 in the call patterns; AP_REGNUM and
+; FP_REGNUM are numbered above LAST_VGPR_REG (415).
+(define_constants
+  [(SP_REGNUM 16)
+   (LR_REGNUM 18)
+   (AP_REGNUM 416)
+   (FP_REGNUM 418)])
+
+; Codes for unspec_volatile operations (e.g. UNSPECV_PROLOGUE_USE in
+; "prologue_use" and UNSPECV_ICACHE_INV in "clear_icache").
+(define_c_enum "unspecv" [
+  UNSPECV_PROLOGUE_USE
+  UNSPECV_KERNEL_RETURN
+  UNSPECV_BARRIER
+  UNSPECV_ATOMIC
+  UNSPECV_ICACHE_INV])
+
+; Codes for (non-volatile) unspec operations.  The *_DPP_SHR codes come one
+; per arithmetic/logical operation; UNSPEC_MOV_FROM_LANE63 is the code used
+; by the mov_from_lane63 patterns.
+(define_c_enum "unspec" [
+  UNSPEC_VECTOR
+  UNSPEC_BPERMUTE
+  UNSPEC_SGPRBASE
+  UNSPEC_MEMORY_BARRIER
+  UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+  UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+  UNSPEC_PLUS_DPP_SHR
+  UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
+  UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
+  UNSPEC_MOV_FROM_LANE63
+  UNSPEC_GATHER
+  UNSPEC_SCATTER])
+
+;; }}}
+;; {{{ Attributes
+
+; Instruction type (encoding) as described in the ISA specification.
+; The following table summarizes possible operands of individual instruction
+; types and corresponding constraints.
+;
+; sop2 - scalar, two inputs, one output
+;	 ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;		      vccz,execz,scc,inline immediate,fp inline immediate
+;	 sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;
+;	 Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
+;
+; sopk - scalar, inline constant input, one output
+;	 simm16: 16bit inline constant
+;	 sdst: same as sop2/ssrc0
+;
+;	 Constraints "=SD", "J"
+;
+; sop1 - scalar, one input, one output
+;	 ssrc0: same as sop2/ssrc0.  FIXME: manual omit VCCZ
+;	 sdst: same as sop2/sdst
+;
+;	 Constraints "=SD", "SSA"
+;
+; sopc - scalar, two inputs, one comparison
+;	 ssrc0: same as sop2/ssrc0.
+;
+;	 Constraints "SSI,SSA","SSA,SSI"
+;
+; sopp - scalar, one constant input, one special
+;	 simm16
+;
+; smem - scalar memory
+;	 sbase: aligned pair of sgprs.  Specify {size[15:0], base[47:0]} in
+;               dwords
+;	 sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
+;	 offset: sgpr or 20bit unsigned byte offset
+;
+; vop2 - vector, two inputs, one output
+;	 vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
+;		inline constant -16 to 64, fp inline immediate, vccz, execz,
+;		scc, lds, literal constant, vgpr0-255
+;	 vsrc1: vgpr0-255
+;	 vdst: vgpr0-255
+;	 Limitations: At most one SGPR, at most one constant
+;		      if constant is used, SGPR must be M0
+;		      Only SRC0 can be LDS_DIRECT
+;
+;	 constraints: "=v", "vBSS", "v"
+;
+; vop1 - vector, one input, one output
+;	 vsrc0: same as vop2/src0
+;	 vdst: vgpr0-255
+;
+;	 constraints: "=v", "vBSS"
+;
+; vopc - vector, two inputs, one comparison output;
+;	 vsrc0: same as vop2/src0
+;	 vsrc1: vgpr0-255
+;	 vdst:
+;
+;	 constraints: "vASS", "v"
+;
+; vop3a - vector, three inputs, one output
+;	 vdst: vgpr0-255, for v_cmp sgpr or vcc
+;	 abs,clamp
+;	 vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
+;		inline constant -16 to 64, fp inline immediate, vccz, execz,
+;		scc, lds_direct
+;		FIXME: really missing 1/pi? really 104 SGPRs
+;
+; vop3b - vector, three inputs, one vector output, one scalar output
+;	 vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
+;	 vdst: vgpr0-255
+;	 sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
+;
+; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
+;	 src0: vgpr0-255
+;	 dst_sel: BYTE_0-3, WORD_0-1, DWORD
+;	 dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
+;	 clamp: true/false
+;	 src0_sel: BYTE_0-3, WORD_0-1, DWORD
+;	 flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
+;		src1_abs
+;
+; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
+;	 src0: vgpr0-255
+;	 dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
+;		  wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
+;		  bcast15, bcast31
+;	 flags: src0_neg, src0_abs, src1_neg, src1_abs
+;	 bank_mask: 4-bit mask
+;	 row_mask: 4-bit mask
+;
+; ds - Local and global data share instructions.
+;	 offset0: 8-bit constant
+;	 offset1: 8-bit constant
+;	 flag: gds
+;	 addr: vgpr0-255
+;	 data0: vgpr0-255
+;	 data1: vgpr0-255
+;	 vdst: vgpr0-255
+;
+; mubuf - Untyped memory buffer operation. First word with LDS, second word
+;	  non-LDS.
+;	 offset: 12-bit constant
+;	 vaddr: vgpr0-255
+;	 vdata: vgpr0-255
+;	 srsrc: sgpr0-102
+;	 soffset: sgpr0-102
+;	 flags: offen, idxen, glc, lds, slc, tfe
+;
+; mtbuf - Typed memory buffer operation. Two words
+;	 offset: 12-bit constant
+;	 dfmt: 4-bit constant
+;	 nfmt: 3-bit constant
+;	 vaddr: vgpr0-255
+;	 vdata: vgpr0-255
+;	 srsrc: sgpr0-102
+;	 soffset: sgpr0-102
+;	 flags: offen, idxen, glc, lds, slc, tfe
+;
+; flat - flat or global memory operations
+;	 flags: glc, slc
+;	 addr: vgpr0-255
+;	 data: vgpr0-255
+;	 vdst: vgpr0-255
+;
+; mult - expands to multiple instructions (pseudo encoding)
+;
+; vmult - as mult, when a vector instruction is used.
+
+; The "type" attribute: one value per encoding described in the table
+; above, plus the pseudo encodings "mult"/"vmult" and the default "unknown".
+(define_attr "type"
+	     "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
+	      vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
+	     (const_string "unknown"))
+
+; Execution unit (scalar or vector), derived from the "type" attribute.
+; Types not listed in either group (unknown, mubuf, mtbuf) give "unknown".
+
+(define_attr "unit" "unknown,scalar,vector"
+  (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
+	    (const_string "scalar")
+	 (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
+			  vop_sdwa,vop_dpp,flat,vmult")
+	    (const_string "vector")]
+	 (const_string "unknown")))
+
+; All vector instructions run as 64 threads as predicated by the EXEC
+; register.  Scalar operations in vector register require a single lane
+; enabled, vector moves require a full set of lanes enabled, and most vector
+; operations handle the lane masking themselves.
+; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
+; according to the following settings:
+;   auto   - instruction doesn't use EXEC, or handles it itself.
+;            md_reorg will inspect def/use to determine what to do.
+;   single - disable all but lane zero.
+;   full   - enable all lanes.
+; The default is "auto"; a "*" entry in a per-alternative set_attr list
+; selects that default.
+
+(define_attr "exec" "auto,single,full"
+   (const_string "auto"))
+
+; Infer the (worst-case) length in bytes from the instruction type by
+; default.  Many types can have an optional immediate word following, which
+; we include here.  "Multiple" types are counted as two 64-bit instructions.
+; This is just a default fallback: it can be overridden per-alternative in
+; insn patterns for greater accuracy.  Types with no entry below (unknown,
+; mubuf, mtbuf) fall through to the final default of 4.
+
+(define_attr "length" ""
+  (cond [(eq_attr "type" "sop1") (const_int 8)
+	 (eq_attr "type" "sop2") (const_int 8)
+	 (eq_attr "type" "sopk") (const_int 8)
+	 (eq_attr "type" "sopc") (const_int 8)
+	 (eq_attr "type" "sopp") (const_int 4)
+	 (eq_attr "type" "smem") (const_int 8)
+	 (eq_attr "type" "ds")   (const_int 8)
+	 (eq_attr "type" "vop1") (const_int 8)
+	 (eq_attr "type" "vop2") (const_int 8)
+	 (eq_attr "type" "vopc") (const_int 8)
+	 (eq_attr "type" "vop3a") (const_int 8)
+	 (eq_attr "type" "vop3b") (const_int 8)
+	 (eq_attr "type" "vop_sdwa") (const_int 8)
+	 (eq_attr "type" "vop_dpp") (const_int 8)
+	 (eq_attr "type" "flat") (const_int 8)
+	 (eq_attr "type" "mult") (const_int 16)
+	 (eq_attr "type" "vmult") (const_int 16)]
+	(const_int 4)))
+
+; Disable alternatives that only apply to specific ISA variants.
+; Alternatives tagged "gcn3" (the default) are always enabled; alternatives
+; tagged "gcn5" are enabled only when TARGET_GCN5_PLUS is nonzero.
+
+(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
+
+(define_attr "enabled" ""
+  (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
+	 (and (eq_attr "gcn_version" "gcn5")
+	      (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+	   (const_int 1)]
+	(const_int 0)))
+
+; We need to be able to identify v_readlane and v_writelane with
+; SGPR lane selection in order to handle "Manually Inserted Wait States".
+; Patterns that need this set the attribute to "yes"; the default is "no".
+
+(define_attr "laneselect" "yes,no" (const_string "no"))
+
+;; }}}
+;; {{{ Iterators useful across the whole machine description
+
+; Pairs of scalar modes that share a single pattern: 32/64-bit integer,
+; 32/64-bit float, 32-bit integer/float, 8/16-bit integer and 64-bit
+; integer/float respectively.
+(define_mode_iterator SIDI [SI DI])
+(define_mode_iterator SFDF [SF DF])
+(define_mode_iterator SISF [SI SF])
+(define_mode_iterator QIHI [QI HI])
+(define_mode_iterator DIDF [DI DF])
+
+;; }}}
+;; {{{ Attributes.
+
+; Translate RTX code into GCN instruction mnemonics with and without
+; suffixes such as _b32, etc.  The %i, %u, %b and %B markers stand for the
+; type suffix, which is filled in when the instruction is output
+; (presumably by the port's operand-printing code -- confirm in gcn.c).
+
+(define_code_attr mnemonic
+  [(minus "sub%i")
+   (plus "add%i")
+   (ashift "lshl%b")
+   (lshiftrt "lshr%b")
+   (ashiftrt "ashr%i")
+   (and "and%B")
+   (ior "or%B")
+   (xor "xor%B")
+   (mult "mul%i")
+   (smin "min%i")
+   (smax "max%i")
+   (umin "min%u")
+   (umax "max%u")
+   (not "not%b")
+   (popcount "bcnt_u32%b")])
+
+; Mnemonic stems with no suffix marker at all.
+(define_code_attr bare_mnemonic
+  [(plus "add")
+   (minus "sub")
+   (and "and")
+   (ior "or")
+   (xor "xor")])
+
+; Mnemonics for the unary codes, as a separate table from "mnemonic"
+; (note that the popcount entry differs).
+(define_code_attr s_mnemonic
+  [(not "not%b")
+   (popcount "bcnt1_i32%b")])
+
+; "rev" forms of the mnemonics for subtract and the shifts.
+(define_code_attr revmnemonic
+  [(minus "subrev%i")
+   (ashift "lshlrev%b")
+   (lshiftrt "lshrrev%b")
+   (ashiftrt "ashrrev%i")])
+
+; Translate RTX code into the corresponding standard expander name stem,
+; so that e.g. "<expander>si3" iterated over plus/minus yields "addsi3" and
+; "subsi3".
+
+(define_code_attr expander
+  [(and "and")
+   (ior "ior")
+   (xor "xor")
+   (plus "add")
+   (minus "sub")
+   (ashift "ashl")
+   (lshiftrt "lshr")
+   (ashiftrt "ashr")
+   (mult "mul")
+   (smin "smin")
+   (smax "smax")
+   (umin "umin")
+   (umax "umax")
+   (not "one_cmpl")
+   (popcount "popcount")])
+
+;; }}}
+;; {{{ Miscellaneous instructions
+
+; No-operation: emits s_nop with an immediate of zero.
+(define_insn "nop"
+  [(const_int 0)]
+  ""
+  "s_nop\t0x0"
+  [(set_attr "type" "sopp")])
+
+; Unconditional trap.
+; FIXME: What should the value of the immediate be? Zero is disallowed, so
+; pick 1 for now.
+(define_insn "trap"
+  [(trap_if (const_int 1) (const_int 0))]
+  ""
+  "s_trap\t1"
+  [(set_attr "type" "sopp")])
+
+;; }}}
+;; {{{ Moves
+
+;; All scalar modes in which we support moves; the "mov<mode>" expander and
+;; the invalid-move splitter are instantiated once per mode in this list.
+(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF])
+
+; This is the entry point for creating all kinds of scalar moves,
+; including reloads and symbols.  It performs three steps, in order:
+; force the source into a register for stores, bounce invalid moves through
+; a temporary (before register allocation only), and route DImode
+; symbol/label addresses to "movdi_symbol".  Anything else falls through to
+; the plain (set ...) template.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+	(match_operand:MOV_MODE 1 "general_operand"))]
+  ""
+  {
+    /* Stores always take their source from a register.  */
+    if (MEM_P (operands[0]))
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+
+    if (!lra_in_progress && !reload_completed
+	&& !gcn_valid_move_p (<MODE>mode, operands[0], operands[1]))
+      {
+	/* Something is probably trying to generate a move
+	   which can only work indirectly.
+	   E.g. Move from LDS memory to SGPR hardreg
+	     or MEM:QI to SGPR.  */
+	rtx tmpreg = gen_reg_rtx (<MODE>mode);
+	emit_insn (gen_mov<mode> (tmpreg, operands[1]));
+	emit_insn (gen_mov<mode> (operands[0], tmpreg));
+	DONE;
+      }
+
+    /* Symbol and label addresses need the PC-relative sequence.  */
+    if (<MODE>mode == DImode
+	&& (GET_CODE (operands[1]) == SYMBOL_REF
+	    || GET_CODE (operands[1]) == LABEL_REF))
+      {
+	emit_insn (gen_movdi_symbol (operands[0], operands[1]));
+	DONE;
+      }
+  })
+
+; Split invalid moves into two valid moves, going via a fresh pseudo
+; register (operand 2).  Only applies before register allocation, since it
+; creates a new pseudo.
+
+(define_split
+  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+	(match_operand:MOV_MODE 1 "general_operand"))]
+  "!reload_completed && !lra_in_progress
+   && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))]
+  {
+    operands[2] = gen_reg_rtx(<MODE>mode);
+  })
+
+; We need BImode move so we can reload flags registers.
+; The alternatives are, in order: SGPR<-scalar, VGPR<-any, SGPR<-VGPR
+; (v_readlane_b32), SCC<-scalar (compare against 0), VCC<-VGPR (compare
+; against 0), VCC<-scalar (two s_mov_b32), then scalar, flat and global
+; loads/stores.
+; The v_readlane_b32 alternative is typed vop3a, i.e. a 64-bit encoding, so
+; its length is 8, matching the same alternative in the 32-bit move pattern.
+
+(define_insn "*movbi"
+  [(set (match_operand:BI 0 "nonimmediate_operand"
+				    "=SD,   v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
+	(match_operand:BI 1 "gcn_load_operand"
+				    "SSA,vSSA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
+  ""
+  {
+    /* SCC as an operand is currently not accepted by the LLVM assembler, so
+       we emit bytes directly as a workaround.  */
+    switch (which_alternative) {
+    case 0:
+      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+	return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;"
+	       ".byte\t0xfd\;"
+	       ".byte\t0x0\;"
+	       ".byte\t0x80|%R0\;"
+	       ".byte\t0xbe";
+      else
+	return "s_mov_b32\t%0, %1";
+    case 1:
+      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+	return "; v_mov_b32\t%0, %1\;"
+	       ".byte\t0xfd\;"
+	       ".byte\t0x2\;"
+	       ".byte\t((%V0<<1)&0xff)\;"
+	       ".byte\t0x7e|(%V0>>7)";
+      else
+	return "v_mov_b32\t%0, %1";
+    case 2:
+      return "v_readlane_b32\t%0, %1, 0";
+    case 3:
+      return "s_cmpk_lg_u32\t%1, 0";
+    case 4:
+      return "v_cmp_ne_u32\tvcc, 0, %1";
+    case 5:
+      /* Guard REGNO with REG_P, as alternatives 0 and 1 do.  */
+      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+	return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;"
+	       ".byte\t0xfd\;"
+	       ".byte\t0x0\;"
+	       ".byte\t0xea\;"
+	       ".byte\t0xbe\;"
+	       "s_mov_b32\tvcc_hi, 0";
+      else
+	return "s_mov_b32\tvcc_lo, %1\;"
+	       "s_mov_b32\tvcc_hi, 0";
+    case 6:
+      return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
+    case 7:
+      return "s_store_dword\t%1, %A0\;s_waitcnt\tlgkmcnt(0)";
+    case 8:
+      return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
+    case 9:
+      return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\t0";
+    case 10:
+      return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
+    case 11:
+      return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)";
+    default:
+      gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
+		     flat,flat")
+   (set_attr "exec" "*,single,*,*,single,*,*,*,single,single,single,single")
+   (set_attr "length" "4,4,8,4,4,8,12,12,12,12,12,12")])
+
+; 32bit move pattern (SImode and SFmode).  One alternative per combination
+; of source and destination: scalar register/constant moves, scalar buffer
+; and scalar memory accesses, vector register moves, lane 0 transfers
+; between SGPR and VGPR, and flat, LDS/GDS and global memory accesses.
+; Every memory alternative is followed by an s_waitcnt, which is why those
+; alternatives have length 12.
+
+(define_insn "*mov<mode>_insn"
+  [(set (match_operand:SISF 0 "nonimmediate_operand"
+		  "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG,   v,SD, v,RM")
+	(match_operand:SISF 1 "gcn_load_operand"
+		  "SSA, J, B,RB,Sm,RS,Sm,v, v,SS,RF, v,B,   v,RLRG, Y,RM, v"))]
+  ""
+  "@
+  s_mov_b32\t%0, %1
+  s_movk_i32\t%0, %1
+  s_mov_b32\t%0, %1
+  s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+  s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\tlgkmcnt(0)
+  s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  s_store_dword\t%1, %A0\;s_waitcnt\tlgkmcnt(0)
+  v_mov_b32\t%0, %1
+  v_readlane_b32\t%0, %1, 0
+  v_writelane_b32\t%0, %1, 0
+  flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+  flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\t0
+  v_mov_b32\t%0, %1
+  ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  s_mov_b32\t%0, %1
+  global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
+		     flat,vop1,ds,ds,sop1,flat,flat")
+   (set_attr "exec" "*,*,*,*,*,*,*,single,*,*,single,single,single,
+		     single,single,*,single,single")
+   (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
+
+; 8/16bit move pattern.  Register moves use the 32-bit instructions; memory
+; accesses use the size-specific flat, ds and global forms selected by the
+; %o/%s/%b/%u operand modifiers.  Only matches moves accepted by
+; gcn_valid_move_p.
+; The v_readlane_b32/v_writelane_b32 alternatives are typed vop3a, i.e.
+; 64-bit encodings, so their length is 8, as in the 32-bit move pattern.
+
+(define_insn "*mov<mode>_insn"
+  [(set (match_operand:QIHI 0 "nonimmediate_operand"
+				 "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG,   v, v,RM")
+	(match_operand:QIHI 1 "gcn_load_operand"
+				 "SSA, J, B,v, v,SS,RF, v,B,   v,RLRG,RM, v"))]
+  "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+  "@
+  s_mov_b32\t%0, %1
+  s_movk_i32\t%0, %1
+  s_mov_b32\t%0, %1
+  v_mov_b32\t%0, %1
+  v_readlane_b32\t%0, %1, 0
+  v_writelane_b32\t%0, %1, 0
+  flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+  flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\t0
+  v_mov_b32\t%0, %1
+  ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type"
+	     "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
+   (set_attr "exec" "*,*,*,single,*,*,single,single,single,single,
+		     single,single,single")
+   (set_attr "length" "4,4,8,4,8,8,12,12,8,12,12,12,12")])
+
+; 64bit move pattern (DImode and DFmode).  Symbol addresses are excluded;
+; they are handled by "movdi_symbol".
+; Alternatives whose template is "#" have no single instruction and are
+; split into two 32-bit moves.  The split fires after reload for
+; register/constant moves that are not SGPR-to-SGPR, and at any time for a
+; CONST_INT source that gcn_constant64_p rejects.
+
+(define_insn_and_split "*mov<mode>_insn"
+  [(set (match_operand:DIDF 0 "nonimmediate_operand"
+			  "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG,   v, v,RM")
+	(match_operand:DIDF 1 "general_operand"
+			  "SSA, C,DB,Sm,RS,v,DB, v,SS,RF, v,   v,RLRG,RM, v"))]
+  "GET_CODE (operands[1]) != SYMBOL_REF"
+  "@
+  s_mov_b64\t%0, %1
+  s_mov_b64\t%0, %1
+  #
+  s_store_dwordx2\t%1, %A0\;s_waitcnt\tlgkmcnt(0)
+  s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  #
+  #
+  #
+  #
+  flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+  flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\t0
+  ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)"
+  "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
+    && !gcn_sgpr_move_p (operands[0], operands[1]))
+   || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))]
+  {
+    rtx inlo = gen_lowpart (SImode, operands[1]);
+    rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
+    rtx outlo = gen_lowpart (SImode, operands[0]);
+    rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);
+
+    /* Ensure that overlapping registers aren't corrupted: if writing the
+       low half first would overwrite the source's high half, move the high
+       half first.  The source may be a constant here (see the split
+       condition), so only apply REGNO to operands that really are
+       registers.  */
+    if (REG_P (outlo) && REG_P (inhi) && REGNO (outlo) == REGNO (inhi))
+      {
+	operands[0] = outhi;
+	operands[1] = inhi;
+	operands[2] = outlo;
+	operands[3] = inlo;
+      }
+    else
+      {
+	operands[0] = outlo;
+	operands[1] = inlo;
+	operands[2] = outhi;
+	operands[3] = inhi;
+      }
+  }
+  [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
+		     flat,ds,ds,flat,flat")
+   (set_attr "exec" "*,*,*,*,*,*,*,*,*,single,single,single,single,single,
+		     single")
+   (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
+
+; 128-bit move.  Memory alternatives use the dwordx4/b128 instructions;
+; register-to-register and constant-to-register alternatives ("#") are
+; split after reload into four 32-bit moves, emitted from part 0 up to
+; part 3.
+; NOTE(review): the split does not reorder the parts for partially
+; overlapping source and destination registers -- confirm that such
+; operands cannot reach it.
+
+(define_insn_and_split "*movti_insn"
+  [(set (match_operand:TI 0 "nonimmediate_operand"
+				      "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v")
+	(match_operand:TI 1 "general_operand"  
+				      "SSB,Sm,RS, v,RF,v,SS, v, v,RM, v,RL"))]
+  ""
+  "@
+  #
+  s_store_dwordx4\t%1, %A0\;s_waitcnt\tlgkmcnt(0)
+  s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\t0
+  flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+  #
+  #
+  #
+  global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)
+  global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
+  "reload_completed
+   && REG_P (operands[0])
+   && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))
+   (set (match_dup 6) (match_dup 7))]
+  {
+    /* Fill in the parts from the highest down, so that operands[0] and
+       operands[1] are still the full TImode values until the last two
+       assignments.  */
+    operands[6] = gcn_operand_part (TImode, operands[0], 3);
+    operands[7] = gcn_operand_part (TImode, operands[1], 3);
+    operands[4] = gcn_operand_part (TImode, operands[0], 2);
+    operands[5] = gcn_operand_part (TImode, operands[1], 2);
+    operands[2] = gcn_operand_part (TImode, operands[0], 1);
+    operands[3] = gcn_operand_part (TImode, operands[1], 1);
+    operands[0] = gcn_operand_part (TImode, operands[0], 0);
+    operands[1] = gcn_operand_part (TImode, operands[1], 0);
+  }
+  [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
+		     ds,ds")
+   (set_attr "exec" "*,*,*,single,single,*,*,*,single,single,single,single")
+   (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
+
+;; }}}
+;; {{{ Prologue/Epilogue
+
+; Marker insn that references operand 0 but emits no code (empty template,
+; length 0).
+(define_insn "prologue_use"
+  [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
+  ""
+  ""
+  [(set_attr "length" "0")])
+
+; Function prologue; all the work is done by gcn_expand_prologue.
+(define_expand "prologue"
+  [(const_int 0)]
+  ""
+  {
+    gcn_expand_prologue ();
+    DONE;
+  })
+
+; Function epilogue; all the work is done by gcn_expand_epilogue.
+(define_expand "epilogue"
+  [(const_int 0)]
+  ""
+  {
+    gcn_expand_epilogue ();
+    DONE;
+  })
+
+;; }}}
+;; {{{ Control flow
+
+; This pattern must satisfy simplejump_p, which means it cannot be a parallel
+; that clobbers SCC.  Thus, we must preserve SCC if we're generating a long
+; branch sequence.
+; When the target is within [-131072, 131072) bytes a single 4-byte s_branch
+; is used.  Otherwise a 32-byte sequence saves SCC in s22, computes the
+; target address PC-relatively in s[20:21], restores SCC by comparing s22
+; against zero, and jumps with s_setpc_b64.
+
+(define_insn "jump"
+  [(set (pc)
+	(label_ref (match_operand 0)))]
+  ""
+  {
+    if (get_attr_length (insn) == 4)
+      return "s_branch\t%0";
+    else
+      /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG.  */
+      return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
+	     ".long\t0xbe9600fd\;"
+	     "s_getpc_b64\ts[20:21]\;"
+	     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+	     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+	     "s_cmpk_lg_u32\ts22, 0\;"
+	     "s_setpc_b64\ts[20:21]";
+  }
+  [(set_attr "type" "sopp")
+   (set (attr "length")
+	(if_then_else (and (ge (minus (match_dup 0) (pc))
+			       (const_int -131072))
+			   (lt (minus (match_dup 0) (pc))
+			       (const_int 131072)))
+		      (const_int 4)
+		      (const_int 32)))])
+
+; Jump to the 64-bit address held in an SGPR pair.
+(define_insn "indirect_jump"
+  [(set (pc)
+	(match_operand:DI 0 "register_operand" "Sg"))]
+  ""
+  "s_setpc_b64\t%0"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "4")])
+
+; Conditional branch on a condition register compared against zero.
+; In range ([-131072, 131072) bytes) this is a single s_cbranch.  Out of
+; range, the condition is reversed to branch over a 24-byte PC-relative long
+; jump through s[20:21], giving 28 bytes in total.  SCC is declared as
+; clobbered via the scratch operand 3.
+(define_insn "cjump"
+  [(set (pc)
+	(if_then_else
+	  (match_operator:BI 1 "gcn_conditional_operator"
+	    [(match_operand:BI 2 "gcn_conditional_register_operand" " ca")
+	     (const_int 0)])
+	  (label_ref (match_operand 0))
+	  (pc)))
+   (clobber (match_scratch:BI 3					    "=cs"))]
+  ""
+  {
+    if (get_attr_length (insn) == 4)
+      return "s_cbranch%C1\t%0";
+    else
+      {
+	/* Replace the condition with its reverse so that %C1 below prints
+	   the branch that skips the long jump.  */
+	operands[1] = gen_rtx_fmt_ee (reverse_condition
+				       (GET_CODE (operands[1])),
+				      BImode, operands[2], const0_rtx);
+	/* !!! This sequence clobbers EXEC_SAVE_REG and SCC.  */
+	return "s_cbranch%C1\t.skip%=\;"
+	       "s_getpc_b64\ts[20:21]\;"
+	       "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+	       "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+	       "s_setpc_b64\ts[20:21]\n"
+	       ".skip%=:";
+      }
+  }
+  [(set_attr "type" "sopp")
+   (set (attr "length")
+	(if_then_else (and (ge (minus (match_dup 0) (pc))
+			       (const_int -131072))
+			   (lt (minus (match_dup 0) (pc))
+			       (const_int 131072)))
+		      (const_int 4)
+		      (const_int 28)))])
+
+; Return instruction.  A normal function jumps back through s[18:19]; in
+; every other case (treated as a kernel) the output is s_dcache_wb followed
+; by s_endpgm.
+
+(define_insn "gcn_return"
+  [(return)]
+  ""
+  {
+    const bool is_normal_function
+      = cfun && cfun->machine && cfun->machine->normal_function;
+
+    return (is_normal_function
+	    ? "s_setpc_b64\ts[18:19]"
+	    : "s_dcache_wb\;s_endpgm");
+  }
+  [(set_attr "type" "sop1")
+   (set_attr "length" "8")])
+
+; Standard-named call expander.  It only builds the RTL: the call itself
+; plus clobbers of the link register pair and a DImode scratch, to be
+; matched by the gcn_*call insns.
+(define_expand "call"
+  [(parallel [(call (match_operand 0 "")
+		    (match_operand 1 ""))
+	      (clobber (reg:DI LR_REGNUM))
+	      (clobber (match_scratch:DI 2))])]
+  ""
+  {})
+
+; Direct call.  Alternative 0 ("Y" target) computes the callee address
+; PC-relatively into the scratch SGPR pair and calls through it (24 bytes);
+; alternative 1 ("B" target) passes the operand straight to s_swappc_b64.
+; Either way the return address is written to s[18:19].
+(define_insn "gcn_simple_call"
+  [(call (mem (match_operand 0 "immediate_operand" "Y,B"))
+	 (match_operand 1 "const_int_operand"))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 2 "=&Sg,X"))]
+  ""
+  "@
+  s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2
+  s_swappc_b64\ts[18:19], %0"
+  [(set_attr "type" "mult,sop1")
+   (set_attr "length" "24,4")])
+
+; Load the address of a symbol or label into an SGPR pair, PC-relatively.
+; Weak symbols use a @gotpcrel32 offset and then load the address from that
+; location with s_load_dwordx2; everything else adds the @rel32 offset
+; directly.  SCC is declared as clobbered.  The length (32) covers the
+; longer, weak-symbol sequence.
+(define_insn "movdi_symbol"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
+       (match_operand:DI 1 "general_operand" "Y"))
+  (clobber (reg:BI SCC_REG))]
+ "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
+  {
+    if (SYMBOL_REF_P (operands[1])
+	&& SYMBOL_REF_WEAK (operands[1]))
+	return "s_getpc_b64\t%0\;"
+	       "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
+	       "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
+	       "s_load_dwordx2\t%0, %0\;"
+	       "s_waitcnt\tlgkmcnt(0)";
+
+    return "s_getpc_b64\t%0\;"
+	   "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
+	   "s_addc_u32\t%H0, %H0, %1@rel32@hi+4";
+  }
+ [(set_attr "type" "mult")
+  (set_attr "length" "32")])
+
+; Call through an address held in an SGPR pair; the return address is
+; written to s[18:19].  The scratch operand is unused ("X") and exists only
+; to match the RTL produced by the "call" expander.
+(define_insn "gcn_indirect_call"
+  [(call (mem (match_operand:DI 0 "register_operand" "Sg"))
+	 (match_operand 1 "" ""))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 2 "=X"))]
+  ""
+  "s_swappc_b64\ts[18:19], %0"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "4")])
+
+; Standard-named expander for calls that return a value in operand 0.  As
+; with "call", it only builds the RTL with the link-register and scratch
+; clobbers.
+(define_expand "call_value"
+  [(parallel [(set (match_operand 0 "")
+		   (call (match_operand 1 "")
+			 (match_operand 2 "")))
+	      (clobber (reg:DI LR_REGNUM))
+	      (clobber (match_scratch:DI 3))])]
+  ""
+  {})
+
+; Direct call returning a value; the value-returning twin of
+; "gcn_simple_call".  Alternative 0 ("Y" target) computes the callee
+; address PC-relatively into the scratch SGPR pair and calls through it;
+; alternative 1 ("B" target) passes the operand straight to s_swappc_b64.
+; The "type" matches gcn_simple_call ("mult" for the multi-instruction
+; alternative).  The length is kept at the worst case (24) for both
+; alternatives.
+(define_insn "gcn_call_value"
+  [(set (match_operand 0 "register_operand" "=Sg,Sg")
+	(call (mem (match_operand 1 "immediate_operand" "Y,B"))
+	      (match_operand 2 "const_int_operand")))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 3 "=&Sg,X"))]
+  ""
+  "@
+  s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, %H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
+  s_swappc_b64\ts[18:19], %1"
+  [(set_attr "type" "mult,sop1")
+   (set_attr "length" "24")])
+
+; Value-returning call through an address held in an SGPR pair; the return
+; address is written to s[18:19].  The scratch operand is unused ("X").
+(define_insn "gcn_call_value_indirect"
+  [(set (match_operand 0 "register_operand" "=Sg")
+	(call (mem (match_operand:DI 1 "register_operand" "Sg"))
+	      (match_operand 2 "" "")))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 3 "=X"))]
+  ""
+  "s_swappc_b64\ts[18:19], %1"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "4")])
+
+; GCN does not have an instruction to clear only part of the instruction
+; cache, so the operands (the start and end of the range) are ignored and
+; the whole cache is invalidated with s_icache_inv.
+
+(define_insn "clear_icache"
+  [(unspec_volatile
+    [(match_operand 0 "") (match_operand 1 "")]
+    UNSPECV_ICACHE_INV)]
+  ""
+  "s_icache_inv"
+  [(set_attr "type" "sopp")
+   (set_attr "length" "4")])
+
+;; }}}
+;; {{{ Conditionals
+
+; 32-bit compare, scalar unit only.  The result is written to SCC ("cs").
+; Alternative 1 uses the s_cmpk form, which takes the second operand as an
+; inline 16-bit constant; the other three use s_cmp, with alternatives 2 and
+; 3 allowing a literal constant in one position (hence length 8).
+; Only the s_cmpk alternative is a SOPK encoding; the s_cmp alternatives are
+; all SOPC.
+
+(define_insn "cstoresi4"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand"
+							 "=cs, cs, cs, cs")
+	(match_operator:BI 1 "gcn_compare_operator"
+	  [(match_operand:SI 2 "gcn_alu_operand"	 "SSA,SSA,SSB, SS")
+	   (match_operand:SI 3 "gcn_alu_operand"	 "SSA,SSL, SS,SSB")]))]
+  ""
+  "@
+   s_cmp%D1\t%2, %3
+   s_cmpk%D1\t%2, %3
+   s_cmp%D1\t%2, %3
+   s_cmp%D1\t%2, %3"
+  [(set_attr "type" "sopc,sopk,sopc,sopc")
+   (set_attr "length" "4,4,8,8")])
+
+; Compare-and-branch on SImode values: store the comparison result in a
+; BImode pseudo, then branch to operands[3] if that result is non-zero.
+(define_expand "cbranchsi4"
+  [(match_operator 0 "gcn_compare_operator"
+     [(match_operand:SI 1 "gcn_alu_operand")
+      (match_operand:SI 2 "gcn_alu_operand")])
+   (match_operand 3)]
+  ""
+  {
+    /* Materialise the comparison result...  */
+    rtx flag = gen_reg_rtx (BImode);
+    emit_insn (gen_cstoresi4 (flag, operands[0], operands[1], operands[2]));
+
+    /* ...and jump when it is set.  */
+    rtx flag_is_set = gen_rtx_NE (BImode, flag, const0_rtx);
+    emit_jump_insn (gen_cjump (operands[3], flag_is_set, flag));
+    DONE;
+  })
+
+; 64-bit compare; either unit.  The expander attaches a (use ...) of the
+; value returned by gcn_scalar_exec as operand 4, which the insns below
+; match against their "gcn_exec_operand".
+
+(define_expand "cstoredi4"
+  [(parallel [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+		   (match_operator:BI 1 "gcn_compare_operator"
+		     [(match_operand:DI 2 "gcn_alu_operand")
+		      (match_operand:DI 3 "gcn_alu_operand")]))
+	      (use (match_dup 4))])]
+  ""
+  {
+    operands[4] = gcn_scalar_exec ();
+  })
+
+; 64-bit compare for the operators in "gcn_compare_64bit_operator".
+; Alternative 0 is a scalar s_cmp writing SCC ("cs"); alternative 1 is a
+; vector v_cmp writing VCC ("cV") and using EXEC.
+(define_insn "cstoredi4_vec_and_scalar"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs,  cV")
+	(match_operator:BI 1 "gcn_compare_64bit_operator"
+	  [(match_operand:DI 2 "gcn_alu_operand"	       "%SSA,vSSC")
+	   (match_operand:DI 3 "gcn_alu_operand"	       " SSC,   v")]))
+   (use (match_operand:DI 4 "gcn_exec_operand"		       "   n,   e"))]
+  ""
+  "@
+   s_cmp%D1\t%2, %3
+   v_cmp%E1\tvcc, %2, %3"
+  [(set_attr "type" "sopc,vopc")
+   (set_attr "length" "8")])
+
+; 64-bit compare for any operator in "gcn_compare_operator", vector unit
+; only: v_cmp writing VCC.
+(define_insn "cstoredi4_vector"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
+	(match_operator:BI 1 "gcn_compare_operator"
+          [(match_operand:DI 2 "gcn_alu_operand"	       "vSSB")
+	   (match_operand:DI 3 "gcn_alu_operand"	       "   v")]))
+   (use (match_operand:DI 4 "gcn_exec_operand"		       "   e"))]
+  ""
+  "v_cmp%E1\tvcc, %2, %3"
+  [(set_attr "type" "vopc")
+   (set_attr "length" "8")])
+
+; Compare-and-branch on DImode values: store the comparison result in a
+; BImode pseudo, then branch to operands[3] if that result is non-zero.
+(define_expand "cbranchdi4"
+  [(match_operator 0 "gcn_compare_operator"
+     [(match_operand:DI 1 "gcn_alu_operand")
+      (match_operand:DI 2 "gcn_alu_operand")])
+   (match_operand 3)]
+  ""
+  {
+    /* Materialise the comparison result...  */
+    rtx flag = gen_reg_rtx (BImode);
+    emit_insn (gen_cstoredi4 (flag, operands[0], operands[1], operands[2]));
+
+    /* ...and jump when it is set.  */
+    rtx flag_is_set = gen_rtx_NE (BImode, flag, const0_rtx);
+    emit_jump_insn (gen_cjump (operands[3], flag_is_set, flag));
+    DONE;
+  })
+
+; FP compare; vector unit only.  As for cstoredi4, the expander attaches a
+; (use ...) of the value returned by gcn_scalar_exec as operand 4.
+
+(define_expand "cstore<mode>4"
+  [(parallel [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+		   (match_operator:BI 1 "gcn_fp_compare_operator"
+		     [(match_operand:SFDF 2 "gcn_alu_operand")
+		      (match_operand:SFDF 3 "gcn_alu_operand")]))
+	      (use (match_dup 4))])]
+  ""
+  {
+    operands[4] = gcn_scalar_exec ();
+  })
+
+; SFmode/DFmode compare: a single v_cmp writing VCC.  Despite the name
+; there is only a vector alternative.
+(define_insn "cstore<mode>4_vec_and_scalar"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
+	(match_operator:BI 1 "gcn_fp_compare_operator"
+	  [(match_operand:SFDF 2 "gcn_alu_operand"		"vB")
+	   (match_operand:SFDF 3 "gcn_alu_operand"		 "v")]))
+   (use (match_operand:DI 4 "gcn_exec_operand"			 "e"))]
+  ""
+  "v_cmp%E1\tvcc, %2, %3"
+  [(set_attr "type" "vopc")
+   (set_attr "length" "8")])
+
+; Compare-and-branch on SFmode/DFmode values: store the comparison result
+; in a BImode pseudo, then branch to operands[3] if that result is non-zero.
+(define_expand "cbranch<mode>4"
+  [(match_operator 0 "gcn_fp_compare_operator"
+     [(match_operand:SFDF 1 "gcn_alu_operand")
+      (match_operand:SFDF 2 "gcn_alu_operand")])
+   (match_operand 3)]
+  ""
+  {
+    /* Materialise the comparison result...  */
+    rtx flag = gen_reg_rtx (BImode);
+    emit_insn (gen_cstore<mode>4 (flag, operands[0], operands[1],
+				  operands[2]));
+
+    /* ...and jump when it is set.  */
+    rtx flag_is_set = gen_rtx_NE (BImode, flag, const0_rtx);
+    emit_jump_insn (gen_cjump (operands[3], flag_is_set, flag));
+    DONE;
+  })
+
+;; }}}
+;; {{{ ALU special cases: Plus
+
+; Iterator over addition and subtraction, and a predicate accepting the
+; same two codes for use with match_operator.
+(define_code_iterator plus_minus [plus minus])
+
+(define_predicate "plus_minus_operator"
+  (match_code "plus,minus"))
+
+; addsi3/subsi3 expander.  The generated RTL carries a (use ...) of the
+; value returned by gcn_scalar_exec and clobbers of both SCC and VCC, so
+; that either a scalar or a vector instruction can be selected later.
+(define_expand "<expander>si3"
+  [(parallel [(set (match_operand:SI 0 "register_operand")
+		   (plus_minus:SI (match_operand:SI 1 "gcn_alu_operand")
+				  (match_operand:SI 2 "gcn_alu_operand")))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))
+	      (clobber (reg:DI VCC_REG))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+; 32-bit add; pre-reload undecided unit.  The first three alternatives are
+; scalar (s_add_i32, s_addk_i32 with the destination tied to operand 1, and
+; s_add_i32 with a literal constant); the last is the vector v_add_i32,
+; which uses EXEC.  Both SCC and VCC are declared clobbered because the
+; unit is not yet known.
+
+(define_insn "*addsi3_vec_and_scalar"
+  [(set (match_operand:SI 0 "register_operand"         "= Sg, Sg, Sg,   v")
+        (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA,  0,SgA,   v")
+		 (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ,  B,vBSg")))
+   (use (match_operand:DI 3 "gcn_exec_operand"         "   n,  n,  n,   e"))
+   (clobber (reg:BI SCC_REG))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "@
+   s_add_i32\t%0, %1, %2
+   s_addk_i32\t%0, %2
+   s_add_i32\t%0, %1, %2
+   v_add_i32\t%0, %1, %2"
+  [(set_attr "type" "sop2,sopk,sop2,vop2")
+   (set_attr "length" "4,4,8,8")])
+
+; Discard VCC clobber, post reload.
+
+(define_split
+  [(set (match_operand:SIDI 0 "register_operand")
+        (match_operator:SIDI 3 "plus_minus_operator"
+	  [(match_operand:SIDI 1 "gcn_alu_operand")
+	   (match_operand:SIDI 2 "gcn_alu_operand")]))
+   (use (match_operand:DI 4 "" ""))
+   (clobber (reg:BI SCC_REG))
+   (clobber (reg:DI VCC_REG))]
+  "reload_completed && gcn_sdst_register_operand (operands[0], VOIDmode)"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
+	      (clobber (reg:BI SCC_REG))])])
+
+; Discard SCC clobber, post reload.
+; FIXME: do we have an insn for this?
+
+(define_split
+  [(set (match_operand:SIDI 0 "register_operand")
+        (match_operator:SIDI 3 "plus_minus_operator"
+			 [(match_operand:SIDI 1 "gcn_alu_operand")
+			  (match_operand:SIDI 2 "gcn_alu_operand")]))
+   (use (match_operand:DI 4 ""))
+   (clobber (reg:BI SCC_REG))
+   (clobber (reg:DI VCC_REG))]
+  "reload_completed && gcn_vgpr_register_operand (operands[0], VOIDmode)"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
+	      (use (match_dup 4))
+	      (clobber (reg:DI VCC_REG))])])
+
+; 32-bit add, scalar unit.
+
+(define_insn "*addsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"	       "= Sg, Sg, Sg")
+	(plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA,  0,SgA")
+		 (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ,  B")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_add_i32\t%0, %1, %2
+   s_addk_i32\t%0, %2
+   s_add_i32\t%0, %1, %2"
+  [(set_attr "type" "sop2,sopk,sop2")
+   (set_attr "length" "4,4,8")])
+
+; Having this as an insn_and_split allows us to keep together DImode adds
+; through some RTL optimisation passes, and means the CC reg we set isn't
+; dependent on the constraint alternative (which doesn't seem to work well).
+
+; There's an early clobber in the case where "v[0:1]=v[1:2]+?" but
+; "v[0:1]=v[0:1]+?" is fine (as is "v[1:2]=v[0:1]+?", but that's trickier).
+
+; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
+; used as an operand due to the read of VCC, so we restrict constants to the
+; inlinable range for that alternative.
+
+; 64-bit add.  Always output as "#" and split after reload into a 32-bit add
+; that sets a carry flag followed by a 32-bit add that consumes it.
+(define_insn_and_split "adddi3"
+  [(set (match_operand:DI 0 "register_operand"		
+					      "=&Sg,&Sg,&Sg,&Sg,&v,&v,&v,&v")
+	(plus:DI (match_operand:DI 1 "register_operand" 
+					      "  Sg,  0,  0, Sg, v, 0, 0, v")
+		 (match_operand:DI 2 "nonmemory_operand"
+					      "   0,SgB,  0,SgB, 0,vA, 0,vA")))
+   (clobber (reg:BI SCC_REG))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    /* The carry lives in VCC when operand 1 is a VGPR, otherwise in SCC.  */
+    rtx cc = gen_rtx_REG (BImode, gcn_vgpr_register_operand (operands[1],
+							     DImode)
+			  ? VCC_REG : SCC_REG);
+
+    /* Part 0 of each operand: add and record the carry in CC.  */
+    emit_insn (gen_addsi3_scalar_carry
+	       (gcn_operand_part (DImode, operands[0], 0),
+		gcn_operand_part (DImode, operands[1], 0),
+		gcn_operand_part (DImode, operands[2], 0),
+		cc));
+    /* Part 1: add with carry-in.  When part 1 of operand 2 is const0_rtx
+       use the pattern that has no second addend.  */
+    rtx val = gcn_operand_part (DImode, operands[2], 1);
+    if (val != const0_rtx)
+      emit_insn (gen_addcsi3_scalar
+		 (gcn_operand_part (DImode, operands[0], 1),
+		  gcn_operand_part (DImode, operands[1], 1),
+		  gcn_operand_part (DImode, operands[2], 1),
+		  cc, cc));
+    else
+      emit_insn (gen_addcsi3_scalar_zero
+		 (gcn_operand_part (DImode, operands[0], 1),
+		  gcn_operand_part (DImode, operands[1], 1),
+		  cc));
+    DONE;
+  }
+  [(set_attr "type" "mult,mult,mult,mult,vmult,vmult,vmult,vmult")
+   (set_attr "length" "8")
+   ; FIXME: These patterns should have (use (exec)) but that messes up
+   ;        the generic splitters, so use single instead
+   (set_attr "exec" "*,*,*,*,single,single,single,single")])
+
+;; Add with carry.
+
+; 32-bit add that also produces a carry: operand 0 = operand 1 + operand 2,
+; and operand 3 (BImode) is set when the sum is unsigned-less-than operand 1.
+; The first alternative is the scalar form; the second is the vector form,
+; whose template names vcc as the carry destination.
+(define_insn "addsi3_scalar_carry"
+  [(set (match_operand:SI 0 "register_operand"	       "= Sg, v")
+	(plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
+		 (match_operand:SI 2 "gcn_alu_operand" " SgB,vB")))
+   (set (match_operand:BI 3 "register_operand"	       "= cs,cV")
+	(ltu:BI (plus:SI (match_dup 1)
+			 (match_dup 2))
+		(match_dup 1)))]
+  ""
+  "@
+   s_add_u32\t%0, %1, %2
+   v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "8,8")
+   (set_attr "exec" "*,single")])
+
+(define_insn "addsi3_scalar_carry_cst"
+  [(set (match_operand:SI 0 "register_operand"           "=Sg, v")
+        (plus:SI (match_operand:SI 1 "gcn_alu_operand"   "SgA, v")
+		 (match_operand:SI 2 "const_int_operand" "  n, n")))
+   (set (match_operand:BI 4 "register_operand"           "=cs,cV")
+	(geu:BI (plus:SI (match_dup 1)
+			 (match_dup 2))
+		(match_operand:SI 3 "const_int_operand"  "  n, n")))]
+  "INTVAL (operands[2]) == -INTVAL (operands[3])"
+  "@
+   s_add_u32\t%0, %1, %2
+   v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "4")
+   (set_attr "exec" "*,single")])
+
+; 32-bit add with carry-in and carry-out: operand 0 = zero_extend (operand 3)
+; + operand 1 + operand 2.  The carry-out is written to operand 4, which the
+; matching constraint "3" ties to the same register as the carry-in.
+; NOTE(review): operand 3 is only read in this pattern, yet its constraint
+; string begins with '='; confirm that this is intentional.
+(define_insn "addcsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"			   "= Sg, v")
+	(plus:SI (plus:SI (zero_extend:SI
+			    (match_operand:BI 3 "register_operand" "= cs,cV"))
+			  (match_operand:SI 1 "gcn_alu_operand"    "%SgA, v"))
+		 (match_operand:SI 2 "gcn_alu_operand"		   " SgB,vA")))
+   (set (match_operand:BI 4 "register_operand"			   "=  3, 3")
+	(ior:BI (ltu:BI (plus:SI
+			  (plus:SI
+			    (zero_extend:SI (match_dup 3))
+			    (match_dup 1))
+			  (match_dup 2))
+			(match_dup 2))
+		(ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1))
+			(match_dup 1))))]
+  ""
+  "@
+   s_addc_u32\t%0, %1, %2
+   v_addc%^_u32\t%0, vcc, %2, %1, vcc"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "8,4")
+   (set_attr "exec" "*,single")])
+
+(define_insn "addcsi3_scalar_zero"
+  [(set (match_operand:SI 0 "register_operand"		  "=Sg, v")
+        (plus:SI (zero_extend:SI
+		   (match_operand:BI 2 "register_operand" "=cs,cV"))
+		 (match_operand:SI 1 "gcn_alu_operand"    "SgA, v")))
+   (set (match_dup 2)
+	(ltu:BI (plus:SI (zero_extend:SI (match_dup 2))
+			 (match_dup 1))
+		(match_dup 1)))]
+  ""
+  "@
+   s_addc_u32\t%0, %1, 0
+   v_addc%^_u32\t%0, vcc, 0, %1, vcc"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "4")
+   (set_attr "exec" "*,single")])
+
+; "addptr" is the same as "add" except that it must not write to VCC or SCC
+; as a side-effect.  Unfortunately GCN does not have a suitable instruction
+; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp.
+; Note that it is not safe to save/clobber/restore SCC because doing so will
+; break data-flow analysis, so this must use vector registers.
+
+; 64-bit vector add that declares no clobber of SCC or VCC.  The output code
+; emits two instructions and passes CC_SAVE_REG to both as the carry operand.
+(define_insn "addptrdi3"
+  [(set (match_operand:DI 0 "register_operand"		 "= &v")
+	(plus:DI (match_operand:DI 1 "register_operand"	 "  v0")
+		 (match_operand:DI 2 "nonmemory_operand" "vDA0")))]
+  ""
+  {
+    /* Extend the operand list with CC_SAVE_REG (as a DImode register) so
+       that the templates below can refer to it as %3.  */
+    rtx new_operands[4] = { operands[0], operands[1], operands[2],
+			    gen_rtx_REG (DImode, CC_SAVE_REG) };
+
+    /* First the %L parts, writing the carry to %3; then the %H parts, which
+       read the carry from %3 and write it back.  (%L/%H presumably select
+       the low and high 32-bit halves -- see the operand printer.)  */
+    output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands);
+    output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands);
+
+    return "";
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "exec" "single")])
+
+;; }}}
+;; {{{ ALU special cases: Minus
+
+;; Note that the expand and splitters are shared with add, above.
+;; See "plus_minus".
+
+(define_insn "*subsi3_vec_and_scalar"
+  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,    v,   v")
+	(minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA,    v,vBSg")
+		  (match_operand:SI 2 "gcn_alu_operand" "SgA,  B, vBSg,   v")))
+   (use (match_operand:DI 3 "gcn_exec_operand"          "  n,  n,    e,   e"))
+   (clobber (reg:BI SCC_REG))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "@
+   s_sub_i32\t%0, %1, %2
+   s_sub_i32\t%0, %1, %2
+   v_sub_i32\t%0, %1, %2
+   v_sub_i32\t%0, %1, %2"
+  [(set_attr "type" "sop2,sop2,vop2,vop2")
+   (set_attr "length" "4,8,8,8")])
+
+(define_insn "*subsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg")
+        (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA")
+		  (match_operand:SI 2 "gcn_alu_operand" "SgA,  B")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "s_sub_i32\t%0, %1, %2"
+  [(set_attr "type" "sop2,sop2")
+   (set_attr "length" "4,8")])
+
+; 64-bit scalar subtract.  Always output as "#" and split after reload into
+; a 32-bit subtract of part 0 (subsi3_scalar_carry) followed by a
+; subtract-with-borrow of part 1.  Only scalar (Sg) alternatives exist.
+(define_insn_and_split "subdi3"
+  [(set (match_operand:DI 0 "register_operand"        "=Sg, Sg")
+	(minus:DI
+		(match_operand:DI 1 "gcn_alu_operand" "SgA,SgB")
+		(match_operand:DI 2 "gcn_alu_operand" "SgB,SgA")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    /* Part 0 of each operand.  */
+    emit_insn (gen_subsi3_scalar_carry
+	       (gcn_operand_part (DImode, operands[0], 0),
+		gcn_operand_part (DImode, operands[1], 0),
+		gcn_operand_part (DImode, operands[2], 0)));
+    /* Part 1: when part 1 of operand 2 is const0_rtx use the pattern that
+       takes no second source operand.  */
+    rtx val = gcn_operand_part (DImode, operands[2], 1);
+    if (val != const0_rtx)
+      emit_insn (gen_subcsi3_scalar
+		 (gcn_operand_part (DImode, operands[0], 1),
+		  gcn_operand_part (DImode, operands[1], 1),
+		  gcn_operand_part (DImode, operands[2], 1)));
+    else
+      emit_insn (gen_subcsi3_scalar_zero
+		 (gcn_operand_part (DImode, operands[0], 1),
+		  gcn_operand_part (DImode, operands[1], 1)));
+    DONE;
+  }
+  [(set_attr "length" "8")])
+
+(define_insn "subsi3_scalar_carry"
+  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg")
+        (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")
+		  (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
+   (set (reg:BI SCC_REG)
+	(gtu:BI (minus:SI (match_dup 1)
+			  (match_dup 2))
+		(match_dup 1)))]
+  ""
+  "s_sub_u32\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "8")])
+
+(define_insn "subsi3_scalar_carry_cst"
+  [(set (match_operand:SI 0 "register_operand"           "=Sg")
+        (minus:SI (match_operand:SI 1 "gcn_alu_operand"  "SgA")
+		 (match_operand:SI 2 "const_int_operand" "  n")))
+   (set (reg:BI SCC_REG)
+	(leu:BI (minus:SI (match_dup 1)
+			 (match_dup 2))
+		(match_operand:SI 3 "const_int_operand"  "  n")))]
+  "INTVAL (operands[2]) == -INTVAL (operands[3])"
+  "s_sub_u32\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "4")])
+
+(define_insn "subcsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"                    "=Sg, Sg")
+        (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+			    (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB"))
+		 (match_operand:SI 2 "gcn_alu_operand"            "SgB,SgA")))
+   (set (reg:BI SCC_REG)
+	(ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+					    (match_dup 1))
+				 (match_dup 2))
+			(match_dup 1))
+		(gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+				  (match_dup 1))
+			(match_dup 1))))]
+  ""
+  "s_subb_u32\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "8")])
+
+(define_insn "subcsi3_scalar_zero"
+  [(set (match_operand:SI 0 "register_operand"		"=Sg")
+        (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+		  (match_operand:SI 1 "gcn_alu_operand" "SgA")))
+   (set (reg:BI SCC_REG)
+	(gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
+		(match_dup 1)))]
+  ""
+  "s_subb_u32\t%0, %1, 0"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "4")])
+
+;; }}}
+;; {{{ ALU: mult
+
+(define_expand "mulsi3"
+  [(set (match_operand:SI 0 "register_operand")
+        (mult:SI (match_operand:SI 1 "gcn_alu_operand")
+		 (match_operand:SI 2 "gcn_alu_operand")))
+   (use (match_dup 3))]
+  ""
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long
+; immediate.
+; 32-bit multiply before the scalar/vector choice has been made.  The three
+; scalar alternatives output "#": after reload, once operand 0 satisfies
+; gcn_sdst_register_operand, the insn is split into a plain set without the
+; use of operand 3.  Only the vector alternative has a real template.
+(define_insn_and_split "*mulsi3_vec_and_scalar"
+  [(set (match_operand:SI 0 "register_operand"	       "= Sg,Sg, Sg,   v")
+        (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA,   v")
+		 (match_operand:SI 2 "gcn_alu_operand" " SgA, J,  B,vASg")))
+   (use (match_operand:DI 3 "gcn_exec_operand"         "   n, n,  n,   e"))]
+  ""
+  "@
+   #
+   #
+   #
+   v_mul_lo_i32\t%0, %1, %2"
+  "reload_completed && gcn_sdst_register_operand (operands[0], VOIDmode)"
+   [(set (match_dup 0)
+	 (mult:SI (match_dup 1)
+		  (match_dup 2)))]
+  {}
+  [(set_attr "type" "sop2,sopk,sop2,vop3a")
+   (set_attr "length" "4,4,8,4")])
+
+(define_insn "*mulsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"	       "= Sg,Sg, Sg")
+	(mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA")
+		 (match_operand:SI 2 "gcn_alu_operand" " SgA, J,  B")))]
+  ""
+  "@
+   s_mul_i32\t%0, %1, %2
+   s_mulk_i32\t%0, %2
+   s_mul_i32\t%0, %1, %2"
+  [(set_attr "type" "sop2,sopk,sop2")
+   (set_attr "length" "4,4,8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit unop
+
+(define_code_iterator vec_and_scalar_unop [not popcount])
+
+; The const0_rtx serves as a device to differentiate patterns
+; Expander for the SImode unary operations in vec_and_scalar_unop.  Besides
+; the operation itself, the emitted parallel has a use of operand 2
+; (gcn_scalar_exec ()), a use of operand 3 (const0_rtx) and an SCC clobber.
+(define_expand "<expander>si2"
+  [(parallel [(set (match_operand:SI 0 "register_operand")
+		   (vec_and_scalar_unop:SI
+		     (match_operand:SI 1 "gcn_alu_operand")))
+	      (use (match_dup 2))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))])]
+  ""
+  {
+    operands[2] = gcn_scalar_exec ();
+    operands[3] = const0_rtx;
+  })
+
+(define_insn "*<expander>si2"
+  [(set (match_operand:SI 0 "register_operand"  "=Sg,   v")
+        (vec_and_scalar_unop:SI
+	  (match_operand:SI 1 "gcn_alu_operand" "SgB,vSgB")))
+   (use (match_operand:DI 2 "gcn_exec_operand"  "  n,   e"))
+   (use (const_int 0))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_<s_mnemonic>0\t%0, %1
+   v_<s_mnemonic>0\t%0, %1"
+  [(set_attr "type" "sop1,vop1")
+   (set_attr "length" "8")])
+
+(define_insn "*<expander>si2_scalar"
+  [(set (match_operand:SI 0 "register_operand"			      "=Sg")
+        (vec_and_scalar_unop:SI (match_operand:SI 1 "gcn_alu_operand" "SgB")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "s_<s_mnemonic>0\t%0, %1"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit binop
+
+(define_code_iterator vec_and_scalar [and ior xor ashift lshiftrt
+				      ashiftrt smin smax umin umax])
+
+(define_expand "<expander>si3"
+  [(parallel [(set (match_operand:SI 0 "register_operand")
+		   (vec_and_scalar:SI
+		     (match_operand:SI 1 "gcn_alu_operand")
+		     (match_operand:SI 2 "gcn_alu_operand")))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+; No plus and mult - they have variants with a 16-bit immediate
+; and thus are defined separately, in their own sections above.
+(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
+(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
+
+(define_insn "*<expander>si3"
+  [(set (match_operand:SI 0 "register_operand"  "= Sg,   v")
+        (vec_and_scalar_com:SI
+	  (match_operand:SI 1 "gcn_alu_operand" "%SgA,   v")
+	  (match_operand:SI 2 "gcn_alu_operand" " SgB,vSgB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"  "   n,   e"))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   v_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "8")])
+
+(define_insn "*<expander>si3_scalar"
+  [(set (match_operand:SI 0 "register_operand"   "= Sg")
+        (vec_and_scalar_com:SI
+	  (match_operand:SI 1 "register_operand" "%SgA")
+	  (match_operand:SI 2 "gcn_alu_operand"  " SgB")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "s_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "8")])
+
+; We expect this to be split, post-reload to remove the dependency on the
+; exec register in the scalar case.
+
+(define_insn "*<expander>si3_vec_and_scalar"
+  [(set (match_operand:SI 0 "register_operand"	 "=Sg, Sg,   v")
+        (vec_and_scalar_nocom:SI
+	  (match_operand:SI 1 "gcn_alu_operand"  "SgB,SgA,   v")
+	  (match_operand:SI 2 "gcn_alu_operand"  "SgA,SgB,vSgB")))
+     (use (match_operand:DI 3 "gcn_exec_operand" "  n,  n,   e"))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   s_<mnemonic>0\t%0, %1, %2
+   v_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2,sop2,vop2")
+   (set_attr "length" "8")])
+
+(define_insn "<expander>si3_scalar"
+  [(set (match_operand:SI 0 "register_operand"  "=Sg,Sg")
+        (vec_and_scalar_nocom:SI
+	  (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA")
+	  (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   s_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2,sop2")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 64-bit
+
+(define_code_iterator vec_and_scalar64_com [and ior xor])
+
+(define_expand "<expander>di3"
+  [(parallel [(set (match_operand:DI 0 "register_operand")
+		    (vec_and_scalar64_com:DI
+			(match_operand:DI 1 "gcn_alu_operand")
+			(match_operand:DI 2 "gcn_alu_operand")))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+; 64-bit and/ior/xor before the scalar/vector choice has been made.  The
+; scalar alternative is output directly.  The two vector alternatives output
+; "#" and, after reload with operand 0 in a VGPR, are split into two SImode
+; operations, each keeping the use of operand 3 and the SCC clobber.
+(define_insn_and_split "*<expander>di3_vec_and_scalar"
+   [(set (match_operand:DI 0 "register_operand"   "= Sg,  &v,  &v")
+	 (vec_and_scalar64_com:DI
+	  (match_operand:DI 1 "gcn_alu_operand"   "%SgA,   v,   0")
+	   (match_operand:DI 2 "gcn_alu_operand"  " SgC,vSgB,vSgB")))
+      (use (match_operand:DI 3 "gcn_exec_operand" "   n,   e,   e"))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   #
+   #"
+  "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
+  [(parallel [(set (match_dup 4)
+		   (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))])
+   (parallel [(set (match_dup 7)
+		   (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))])]
+  {
+    /* Operands 4-6: part 0 of the destination and of the two sources.  */
+    operands[4] = gcn_operand_part (DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (DImode, operands[1], 0);
+    operands[6] = gcn_operand_part (DImode, operands[2], 0);
+    /* Operands 7-9: part 1 of the destination and of the two sources.  */
+    operands[7] = gcn_operand_part (DImode, operands[0], 1);
+    operands[8] = gcn_operand_part (DImode, operands[1], 1);
+    operands[9] = gcn_operand_part (DImode, operands[2], 1);
+  }
+  [(set_attr "type" "sop2,vop2,vop2")
+   (set_attr "length" "8")])
+
+(define_insn "*<expander>di3_scalar"
+  [(set (match_operand:DI 0 "register_operand"  "= Sg")
+        (vec_and_scalar64_com:DI
+	  (match_operand:DI 1 "gcn_alu_operand" "%SgA")
+	  (match_operand:DI 2 "gcn_alu_operand" " SgC")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "s_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "8")])
+
+; Expander for the DImode shifts in vec_and_scalar_nocom; the shift count
+; (operand 2) is SImode.  Operand 3 is the value of gcn_scalar_exec ().  It
+; is referenced with match_dup so that the preparation statement's
+; assignment has a pattern operand to fill, and so that the emitted insn
+; carries the (use ...) that *<expander>di3_vec_and_scalar matches on.
+(define_expand "<expander>di3"
+  [(parallel [(set (match_operand:DI 0 "register_operand")
+		   (vec_and_scalar_nocom:DI
+		     (match_operand:DI 1 "gcn_alu_operand")
+		     (match_operand:SI 2 "gcn_alu_operand")))
+	      (use (match_dup 3))
+	      (clobber (reg:BI SCC_REG))])]
+  ""
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+(define_insn "*<expander>di3_vec_and_scalar"
+  [(set (match_operand:DI 0 "register_operand"   "=Sg, Sg,   v")
+	(vec_and_scalar_nocom:DI
+	  (match_operand:DI 1 "gcn_alu_operand"  "SgC,SgA,   v")
+	  (match_operand:SI 2 "gcn_alu_operand"  "SgA,SgC,vSgC")))
+     (use (match_operand:DI 3 "gcn_exec_operand" "  n,  n,   e"))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   s_<mnemonic>0\t%0, %1, %2
+   v_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2,sop2,vop2")
+   (set_attr "length" "8")])
+
+(define_insn "*<expander>di3_scalar"
+  [(set (match_operand:DI 0 "register_operand"  "=Sg, Sg")
+        (vec_and_scalar_nocom:DI
+	  (match_operand:DI 1 "gcn_alu_operand" "SgC,SgA")
+	  (match_operand:SI 2 "gcn_alu_operand" "SgA,SgC")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "s_<mnemonic>0\t%0, %1, %2"
+  [(set_attr "type" "sop2,sop2")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Generic splitters
+
+;; These choose the proper insn variant once we've decided on using
+;; vector or scalar ALU.
+
+; Discard (use EXEC) from scalar unops.
+
+(define_split
+  [(set (match_operand 0 "gcn_sdst_register_operand")
+        (match_operator 3 "unary_operator"
+	  [(match_operand 1 "gcn_alu_operand")]))
+   (use (match_operand:DI 2 ""))
+   (use (const_int 0))]
+  "reload_completed"
+  [(set (match_dup 0) (match_op_dup 3 [(match_dup 1)]))])
+
+; Discard const0 from valu unops.
+
+(define_split
+  [(set (match_operand 0 "gcn_vgpr_register_operand")
+        (match_operator 3 "unary_operator"
+	  [(match_operand 1 "gcn_alu_operand")]))
+   (use (match_operand:DI 2 ""))
+   (use (const_int 0))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 3 [(match_dup 1)]))
+              (use (match_dup 2))])])
+
+; Discard (use EXEC) from scalar binops.
+
+(define_split
+  [(set (match_operand 0 "gcn_sdst_register_operand")
+        (match_operator 4 "binary_operator"
+	  [(match_operand 1 "gcn_alu_operand")
+	   (match_operand 2 "gcn_alu_operand")]))
+   (use (match_operand:DI 3 ""))
+   (clobber (reg:BI SCC_REG))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 4 [(match_dup 1) (match_dup 2)]))
+              (clobber (reg:BI SCC_REG))])])
+
+; Discard (clobber SCC) from valu binops.
+
+(define_split
+  [(set (match_operand 0 "gcn_vgpr_register_operand")
+        (match_operator 4 "binary_operator"
+	  [(match_operand 1 "gcn_alu_operand")
+	   (match_operand 2 "gcn_alu_operand")]))
+   (use (match_operand:DI 3 ""))
+   (clobber (reg:BI SCC_REG))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 4 [(match_dup 1) (match_dup 2)]))
+              (use (match_dup 3))])])
+
+;; }}}
+;; {{{ Atomics
+
+; Each compute unit has its own L1 cache. The L2 cache is shared between
+; all the compute units.  Any load or store instruction can skip L1 and
+; access L2 directly using the "glc" flag.  Atomic instructions also skip
+; L1.  The L1 cache can be flushed and invalidated using instructions.
+;
+; Therefore, in order for "acquire" and "release" atomic modes to work
+; correctly across compute units we must flush before each "release"
+; and invalidate the cache after each "acquire".  It might seem like
+; invalidation could be safely done before an "acquire", but since each
+; compute unit can run up to 40 threads simultaneously, all reading values
+; into the L1 cache, this is not actually safe.
+;
+; Additionally, scalar flat instructions access L2 via a different cache
+; (the "constant cache"), so they have separate control instructions.  We
+; do not attempt to invalidate both caches at once; instead, atomics
+; operating on scalar flat pointers will flush the constant cache, and
+; atomics operating on flat or global pointers will flush L1.  It is up to
+; the programmer to get this right.
+
+(define_code_iterator atomicops [plus minus and ior xor])
+(define_mode_attr X [(SI "") (DI "_X2")])
+
+;; TODO compare_and_swap test_and_set inc dec
+;; Hardware also supports min and max, but GCC does not.
+
+; Expand a memory barrier as a set of a BLKmode MEM to an
+; UNSPEC_MEMORY_BARRIER of that same MEM.  The MEM is built here with a
+; scratch address and marked volatile.
+(define_expand "memory_barrier"
+  [(set (match_dup 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+  ""
+  {
+    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+    MEM_VOLATILE_P (operands[0]) = 1;
+  })
+
+(define_insn "*memory_barrier"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+  ""
+  "buffer_wbinvl1_vol"
+  [(set_attr "type" "mubuf")
+   (set_attr "length" "4")])
+
+; FIXME: These patterns have been disabled as they do not seem to work
+; reliably - they can cause hangs or incorrect results.
+; TODO: flush caches according to memory model
+(define_expand "atomic_fetch_<bare_mnemonic><mode>"
+  [(parallel [(set (match_operand:SIDI 0 "register_operand")
+		   (match_operand:SIDI 1 "memory_operand"))
+	      (set (match_dup 1)
+		   (unspec_volatile:SIDI
+		     [(atomicops:SIDI
+		       (match_dup 1)
+		       (match_operand:SIDI 2 "register_operand"))]
+		     UNSPECV_ATOMIC))
+	      (use (match_operand 3 "const_int_operand"))
+	      (use (match_dup 4))])]
+  "0 /* Disabled.  */"
+  {
+    operands[4] = gcn_scalar_exec ();
+  })
+
+(define_insn "*atomic_fetch_<bare_mnemonic><mode>_insn"
+  [(set (match_operand:SIDI 0 "register_operand"     "=Sm, v, v")
+	(match_operand:SIDI 1 "memory_operand"	     "+RS,RF,RM"))
+   (set (match_dup 1)
+	(unspec_volatile:SIDI
+	  [(atomicops:SIDI
+	    (match_dup 1)
+	    (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
+	   UNSPECV_ATOMIC))
+   (use (match_operand 3 "const_int_operand"))
+   (use (match_operand:DI 4 "gcn_exec_operand"       "  n, e, e"))]
+  "0 /* Disabled.  */"
+  "@
+   s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+   flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
+   global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "12")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; FIXME: These patterns are disabled because the instructions don't
+; seem to work as advertised.  Specifically, OMP "team distribute"
+; reductions apparently "lose" some of the writes, similar to what
+; you might expect from a concurrent non-atomic read-modify-write.
+; TODO: flush caches according to memory model
+
+(define_expand "atomic_<bare_mnemonic><mode>"
+  [(parallel [(set (match_operand:SIDI 0 "memory_operand")
+		   (unspec_volatile:SIDI
+		     [(atomicops:SIDI
+		       (match_dup 0)
+		       (match_operand:SIDI 1 "register_operand"))]
+		    UNSPECV_ATOMIC))
+	      (use (match_operand 2 "const_int_operand"))
+	      (use (match_dup 3))])]
+  "0 /* Disabled.  */"
+  {
+    operands[3] = gcn_scalar_exec ();
+  })
+
+(define_insn "*atomic_<bare_mnemonic><mode>_insn"
+  [(set (match_operand:SIDI 0 "memory_operand"       "+RS,RF,RM")
+	(unspec_volatile:SIDI
+	  [(atomicops:SIDI
+	    (match_dup 0)
+	    (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
+	  UNSPECV_ATOMIC))
+   (use (match_operand 2 "const_int_operand"))
+   (use (match_operand:DI 3 "gcn_exec_operand"       "  n, e, e"))]
+  "0 /* Disabled.  */"
+  "@
+   s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
+   flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
+   global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "12")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_mode_attr x2 [(SI "DI") (DI "TI")])
+(define_mode_attr size [(SI "4") (DI "8")])
+(define_mode_attr bitsize [(SI "32") (DI "64")])
+
+; Compare-and-swap on memory operand 1, with the loaded value returned in
+; operand 0.  Per the GCC standard pattern name, operand 2 is presumably the
+; expected old value and operand 3 the new value -- confirm against the
+; GCC Internals manual.  LDS memory is handled by a dedicated insn that takes
+; operands 2 and 3 separately; every other address space uses an insn that
+; takes them packed into one double-width register.
+(define_expand "sync_compare_and_swap<mode>"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "memory_operand")
+   (match_operand:SIDI 2 "register_operand")
+   (match_operand:SIDI 3 "register_operand")]
+  ""
+  {
+    if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
+      {
+	rtx exec = gcn_scalar_exec ();
+	emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     exec));
+	DONE;
+      }
+
+    /* Operands 2 and 3 must be placed in consecutive registers, and passed
+       as a combined value: operand 3 at byte offset 0 and operand 2 at byte
+       offset <size> of a new <x2>mode pseudo.  */
+    rtx src_cmp = gen_reg_rtx (<x2>mode);
+    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, 0), operands[3]);
+    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, <size>), operands[2]);
+    emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
+						     operands[1],
+						     src_cmp,
+						     gcn_scalar_exec ()));
+    DONE;
+  })
+
+(define_insn "sync_compare_and_swap<mode>_insn"
+  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
+	(match_operand:SIDI 1 "memory_operand"      "+RS,RF,RM"))
+   (set (match_dup 1)
+	(unspec_volatile:SIDI
+	  [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
+	  UNSPECV_ATOMIC))
+   (use (match_operand:DI 3 "gcn_exec_operand"      "  n, e, e"))]
+  ""
+  "@
+   s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+   flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
+   global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "12")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; Compare-and-swap for the "RL" memory constraint, used by the
+; sync_compare_and_swap<mode> expander when the memory is in ADDR_SPACE_LDS.
+; Operands 2 and 3 are passed as separate VGPRs, and the template waits on
+; lgkmcnt(0) after the ds_cmpst_rtn instruction.
+(define_insn "sync_compare_and_swap<mode>_lds_insn"
+  [(set (match_operand:SIDI 0 "register_operand"    "= v")
+	(unspec_volatile:SIDI
+	  [(match_operand:SIDI 1 "memory_operand"   "+RL")]
+	  UNSPECV_ATOMIC))
+   (set (match_dup 1)
+	(unspec_volatile:SIDI
+	  [(match_operand:SIDI 2 "register_operand" "  v")
+	   (match_operand:SIDI 3 "register_operand" "  v")]
+	  UNSPECV_ATOMIC))
+   (use (match_operand:DI 4 "gcn_exec_operand"      "  e"))]
+  ""
+  "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
+  [(set_attr "type" "ds")
+   (set_attr "length" "12")])
+
+; Atomic load of memory operand 1 into register operand 0.  Operand 2 is the
+; memory model, passed through unchanged to atomic_load<mode>_insn together
+; with the value of gcn_scalar_exec ().
+(define_expand "atomic_load<mode>"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "memory_operand")
+   (match_operand 2 "immediate_operand")]
+  ""
+  {
+    emit_insn (gen_atomic_load<mode>_insn (operands[0], operands[1],
+					   operands[2], gcn_scalar_exec ()));
+    DONE;
+  })
+
+; Atomic load.  The instruction sequence is chosen from the memory model in
+; operand 2 and from the constraint alternative: 0 outputs s_load, 1 outputs
+; flat_load and 2 outputs global_load.  Every sequence uses "glc" on the load
+; and waits for it to complete; the stronger memory models add cache-control
+; instructions around it.
+; NOTE(review): for the acquire models, alternative 0 emits s_dcache_wb_vol
+; after the load, whereas the seq-cst case emits s_dcache_inv_vol there;
+; confirm which instruction is intended.
+(define_insn "atomic_load<mode>_insn"
+  [(set (match_operand:SIDI 0 "register_operand"  "=Sm, v, v")
+	(unspec_volatile:SIDI
+	  [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
+	  UNSPECV_ATOMIC))
+   (use (match_operand:SIDI 2 "immediate_operand" "  i, i, i"))
+   (use (match_operand:DI 3 "gcn_exec_operand"    "  n, e, e"))]
+  ""
+  {
+    switch (INTVAL (operands[2]))
+      {
+      /* Relaxed: the load and the wait only.  */
+      case MEMMODEL_RELAXED:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
+	  case 1:
+	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+	  case 2:
+	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+	  }
+	break;
+      /* Acquire-like: a cache-control instruction follows the load.  */
+      case MEMMODEL_CONSUME:
+      case MEMMODEL_ACQUIRE:
+      case MEMMODEL_SYNC_ACQUIRE:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
+		   "s_dcache_wb_vol";
+	  case 1:
+	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
+		   "buffer_wbinvl1_vol";
+	  case 2:
+	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
+		   "buffer_wbinvl1_vol";
+	  }
+	break;
+      /* Acq-rel and seq-cst: cache-control instructions both before and
+	 after the load.  */
+      case MEMMODEL_ACQ_REL:
+      case MEMMODEL_SEQ_CST:
+      case MEMMODEL_SYNC_SEQ_CST:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
+		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+	  case 1:
+	    return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
+		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
+	  case 2:
+	    return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+	  }
+	break;
+      }
+    /* Any other memory model or alternative is not handled.  */
+    gcc_unreachable ();
+  }
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "20")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; Atomic store of register operand 1 to memory operand 0.  Operand 2 is the
+; memory model, passed through unchanged to atomic_store<mode>_insn together
+; with the value of gcn_scalar_exec ().
+(define_expand "atomic_store<mode>"
+  [(match_operand:SIDI 0 "memory_operand")
+   (match_operand:SIDI 1 "register_operand")
+   (match_operand 2 "immediate_operand")]
+  ""
+  {
+    emit_insn (gen_atomic_store<mode>_insn (operands[0], operands[1],
+					    operands[2], gcn_scalar_exec ()));
+    DONE;
+  })
+
+(define_insn "atomic_store<mode>_insn" ; Atomic store of operand 1 to operand 0.
+  [(set (match_operand:SIDI 0 "memory_operand"      "=RS,RF,RM")
+	(unspec_volatile:SIDI
+	  [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
+	  UNSPECV_ATOMIC))
+  (use (match_operand:SIDI 2 "immediate_operand"    "  i, i, i")) ; memory model
+  (use (match_operand:DI 3 "gcn_exec_operand"       "  n, e, e"))] ; exec operand
+  ""
+  { /* Return the assembler text for this memory model and alternative.  */
+    switch (INTVAL (operands[2]))
+      {
+      case MEMMODEL_RELAXED: /* Store, then wait.  */
+	switch (which_alternative)
+	  {
+	  case 0: /* RS alternative: s_store.  */
+	    return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
+	  case 1: /* RF alternative: flat_store.  */
+	    return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
+	  case 2: /* RM alternative: global_store.  */
+	    return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
+	  }
+	break;
+      case MEMMODEL_RELEASE: /* Cache instruction, store, then wait.  */
+      case MEMMODEL_SYNC_RELEASE:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+		   "s_waitcnt\tlgkmcnt(0)";
+	  case 1:
+	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+		   "s_waitcnt\t0";
+	  case 2:
+	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+		   "s_waitcnt\tvmcnt(0)";
+	  }
+	break;
+      case MEMMODEL_ACQ_REL: /* As release, plus a trailing cache instruction.  */
+      case MEMMODEL_SEQ_CST:
+      case MEMMODEL_SYNC_SEQ_CST:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+	  case 1:
+	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
+	  case 2:
+	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+	  }
+	break;
+      }
+    gcc_unreachable (); /* Reached only if no case above returned.  */
+  }
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "20")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_expand "atomic_exchange<mode>" ; Atomic swap of operand 2 with memory.
+  [(match_operand:SIDI 0 "register_operand") ; receives the memory contents
+   (match_operand:SIDI 1 "memory_operand")   ; memory location
+   (match_operand:SIDI 2 "register_operand") ; value written to memory
+   (match_operand 3 "immediate_operand")]    ; memory model, passed through
+  ""
+  { /* Emit the _insn pattern, adding gcn_scalar_exec () as its last operand.  */
+    emit_insn (gen_atomic_exchange<mode>_insn (operands[0], operands[1],
+					       operands[2], operands[3],
+					       gcn_scalar_exec ()));
+    DONE;
+  })
+
+(define_insn "atomic_exchange<mode>_insn" ; Operand 0 gets memory; memory gets operand 2.
+  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
+        (match_operand:SIDI 1 "memory_operand"	    "+RS,RF,RM"))
+   (set (match_dup 1)
+	(unspec_volatile:SIDI
+	  [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
+	  UNSPECV_ATOMIC))
+   (use (match_operand 3 "immediate_operand")) ; memory model
+   (use (match_operand:DI 4 "gcn_exec_operand"	    "  n, e, e"))] ; exec operand
+  ""
+  { /* Return the assembler text for this memory model and alternative.  */
+    switch (INTVAL (operands[3]))
+      {
+      case MEMMODEL_RELAXED: /* Swap, then wait.  */
+	switch (which_alternative)
+	  {
+	  case 0: /* RS alternative: s_atomic_swap.  */
+	    return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
+	  case 1: /* RF alternative: flat_atomic_swap.  */
+	    return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
+	  case 2: /* RM alternative: global_atomic_swap.  */
+	    return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+		   "s_waitcnt\tvmcnt(0)";
+	  }
+	break;
+      case MEMMODEL_CONSUME: /* Swap, wait, then cache instruction(s).  */
+      case MEMMODEL_ACQUIRE:
+      case MEMMODEL_SYNC_ACQUIRE:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
+		   "s_dcache_wb_vol\;s_dcache_inv_vol";
+	  case 1:
+	    return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
+		   "buffer_wbinvl1_vol";
+	  case 2:
+	    return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+	  }
+	break;
+      case MEMMODEL_RELEASE: /* Cache instruction, swap, then wait.  */
+      case MEMMODEL_SYNC_RELEASE:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+		   "s_waitcnt\tlgkmcnt(0)";
+	  case 1:
+	    return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+		   "s_waitcnt\t0";
+	  case 2:
+	    return "buffer_wbinvl1_vol\;"
+		   "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+		   "s_waitcnt\tvmcnt(0)";
+	  }
+	break;
+      case MEMMODEL_ACQ_REL: /* As release, plus a trailing cache instruction.  */
+      case MEMMODEL_SEQ_CST:
+      case MEMMODEL_SYNC_SEQ_CST:
+	switch (which_alternative)
+	  {
+	  case 0:
+	    return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+	  case 1:
+	    return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
+	  case 2:
+	    return "buffer_wbinvl1_vol\;"
+		   "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+	  }
+	break;
+      }
+    gcc_unreachable (); /* Reached only if no case above returned.  */
+  }
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "20")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+;; }}}
+;; {{{ OpenACC / OpenMP
+
+(define_expand "oacc_dim_size" ; Set operand 0 from gcn_oacc_dim_size.
+  [(match_operand:SI 0 "register_operand")   ; result
+   (match_operand:SI 1 "const_int_operand")] ; constant passed to the helper
+  ""
+  { /* Move the SImode low part of the helper's result into operand 0.  */
+    rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
+    emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
+    DONE;
+  })
+
+(define_expand "oacc_dim_pos" ; Set operand 0 from gcn_oacc_dim_pos.
+  [(match_operand:SI 0 "register_operand")   ; result
+   (match_operand:SI 1 "const_int_operand")] ; constant passed to the helper
+  ""
+  { /* Move the helper's result for INTVAL (operands[1]) into operand 0.  */
+    emit_move_insn (operands[0], gcn_oacc_dim_pos (INTVAL (operands[1])));
+    DONE;
+  })
+
+(define_expand "gcn_wavefront_barrier" ; UNSPECV_BARRIER on a BLKmode MEM.
+  [(set (match_dup 0)
+	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+  ""
+  { /* Operand 0 is created here: a volatile BLKmode MEM at a SCRATCH address.  */
+    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+    MEM_VOLATILE_P (operands[0]) = 1;
+  })
+
+(define_insn "*gcn_wavefront_barrier" ; Emits s_barrier for UNSPECV_BARRIER.
+  [(set (match_operand:BLK 0 "")
+	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+  ""
+  "s_barrier"
+  [(set_attr "type" "sopp")])
+
+(define_expand "oacc_fork" ; Placeholder: expanding it hits gcc_unreachable.
+  [(set (match_operand:SI 0 "")
+	(match_operand:SI 1 ""))
+   (use (match_operand:SI 2 ""))]
+  ""
+  {
+    /* The oacc_fork and oacc_join named patterns must exist as a pair,
+       but the fork is never actually expanded.  */
+    gcc_unreachable ();
+  })
+
+(define_expand "oacc_join" ; Emits gcn_wavefront_barrier.
+  [(set (match_operand:SI 0 "")
+	(match_operand:SI 1 ""))
+   (use (match_operand:SI 2 ""))]
+  ""
+  { /* None of the three operands is used by the expansion.  */
+    emit_insn (gen_gcn_wavefront_barrier ());
+    DONE;
+  })
+
+;; }}}
+
+(include "gcn-valu.md")
diff --git a/gcc/config/gcn/predicates.md b/gcc/config/gcn/predicates.md
new file mode 100644
index 0000000..df961a4
--- /dev/null
+++ b/gcc/config/gcn/predicates.md
@@ -0,0 +1,193 @@ 
+;; Predicate definitions for GCN.
+;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;; Each predicate below returns true if OP is acceptable for the named operand.
+
+(define_predicate "gcn_conditional_register_operand"
+  (match_operand 0 "register_operand")
+{
+  /* Accept a BImode VCCZ, SCC or EXECZ register, or a BImode pseudo,
+     after looking through a SUBREG to the register it wraps.  */
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+  if (!REG_P (op) || GET_MODE (op) != BImode)
+    return false;
+  return REGNO (op) == VCCZ_REG
+	 || REGNO (op) == SCC_REG
+	 || REGNO (op) == EXECZ_REG
+	 || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_ssrc_register_operand"
+  (match_operand 0 "register_operand")
+{
+  /* Accept any pseudo, or a hard register satisfying SSRC_REGNO_P,
+     after looking through a SUBREG to the register it wraps.  */
+  rtx reg = GET_CODE (op) == SUBREG ? SUBREG_REG (op) : op;
+  if (!REG_P (reg))
+    return false;
+  unsigned int regno = REGNO (reg);
+  return SSRC_REGNO_P (regno) || regno >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_sdst_register_operand"
+  (match_operand 0 "register_operand")
+{
+  /* Accept any pseudo, or a hard register satisfying SDST_REGNO_P,
+     after looking through a SUBREG to the register it wraps.  */
+  rtx reg = GET_CODE (op) == SUBREG ? SUBREG_REG (op) : op;
+  if (!REG_P (reg))
+    return false;
+  unsigned int regno = REGNO (reg);
+  return SDST_REGNO_P (regno) || regno >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_vgpr_register_operand"
+  (match_operand 0 "register_operand")
+{
+  /* Accept any pseudo, or a hard register satisfying VGPR_REGNO_P,
+     after looking through a SUBREG to the register it wraps.  */
+  rtx reg = GET_CODE (op) == SUBREG ? SUBREG_REG (op) : op;
+  if (!REG_P (reg))
+    return false;
+  unsigned int regno = REGNO (reg);
+  return VGPR_REGNO_P (regno) || regno >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_inline_immediate_operand"
+  (match_code "const_int,const_double,const_vector")
+{ /* A constant of one of the codes above that gcn_inline_constant_p accepts.  */
+  return gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vop3_operand" ; Inline immediate or register.
+  (ior (match_operand 0 "gcn_inline_immediate_operand")
+       (match_operand 0 "register_operand")))
+
+(define_predicate "gcn_vec0_operand"
+  (match_code "const_vector")
+{ /* Element 0 is const0_rtx and OP passes gcn_inline_constant_p.  */
+  return CONST_VECTOR_ELT (op, 0) == const0_rtx && gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vec1_operand"
+  (match_code "const_vector")
+{ /* Element 0 is const1_rtx and OP passes gcn_inline_constant_p.  */
+  return CONST_VECTOR_ELT (op, 0) == const1_rtx && gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vec1d_operand"
+  (match_code "const_vector")
+{
+  /* Accept an inline-constant vector whose element 0 is a CONST_DOUBLE
+     identical to dconst1.  */
+  if (!gcn_inline_constant_p (op))
+    return false;
+  rtx first = CONST_VECTOR_ELT (op, 0);
+  return CONST_DOUBLE_P (first)
+	 && real_identical (CONST_DOUBLE_REAL_VALUE (first), &dconst1);
+})
+
+(define_predicate "gcn_const1d_operand"
+  (match_code "const_double")
+{ /* An inline constant whose value is identical to dconst1.  */
+  return gcn_inline_constant_p (op)
+      && real_identical (CONST_DOUBLE_REAL_VALUE (op), &dconst1);
+})
+
+(define_predicate "gcn_32bit_immediate_operand"
+  (match_code "const_int,const_double,const_vector,symbol_ref,label_ref")
+{ /* One of the codes above that gcn_constant_p accepts.  */
+  return gcn_constant_p (op);
+})
+
+; LRA works more smoothly when exec values are immediate constants
+; prior to register allocation.
+(define_predicate "gcn_exec_operand" ; Register or any const_int.
+  (ior (match_operand 0 "register_operand")
+       (match_code "const_int")))
+
+(define_predicate "gcn_exec_reg_operand" ; Same test as register_operand.
+  (match_operand 0 "register_operand"))
+
+(define_predicate "gcn_load_operand" ; Nonimmediate operand or accepted immediate.
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "gcn_32bit_immediate_operand")))
+
+(define_predicate "gcn_alu_operand" ; Register or accepted immediate.
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "gcn_32bit_immediate_operand")))
+
+(define_predicate "gcn_ds_memory_operand" ; memory_operand MEM in an LDS or GDS space.
+  (and (match_code "mem")
+       (and (match_test "AS_LDS_P (MEM_ADDR_SPACE (op)) || AS_GDS_P (MEM_ADDR_SPACE (op))")
+	    (match_operand 0 "memory_operand"))))
+
+(define_predicate "gcn_valu_dst_operand" ; Register or gcn_ds_memory_operand.
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "gcn_ds_memory_operand")))
+
+(define_predicate "gcn_valu_src0_operand" ; Register, accepted immediate, or DS memory.
+  (ior (match_operand 0 "register_operand")
+       (ior (match_operand 0 "gcn_32bit_immediate_operand")
+	    (match_operand 0 "gcn_ds_memory_operand"))))
+
+(define_predicate "gcn_valu_src1_operand" ; Same test as register_operand.
+  (match_operand 0 "register_operand"))
+
+(define_predicate "gcn_valu_src1com_operand" ; Register or accepted immediate.
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "gcn_32bit_immediate_operand")))
+
+(define_predicate "gcn_conditional_operator" ; Matches only eq and ne.
+  (match_code "eq,ne"))
+
+(define_predicate "gcn_compare_64bit_operator" ; Matches only eq and ne.
+  (match_code "eq,ne"))
+
+(define_predicate "gcn_compare_operator" ; Equality plus signed and unsigned ordering codes.
+  (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu"))
+
+(define_predicate "gcn_fp_compare_operator" ; The above codes plus ordered and unordered.
+  (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu,ordered,unordered"))
+
+(define_predicate "unary_operator" ; Matches only not and popcount.
+  (match_code "not,popcount"))
+
+(define_predicate "binary_operator" ; Logical, shift and min/max codes only.
+  (match_code "and,ior,xor,ashift,lshiftrt,ashiftrt,smin,smax,umin,umax"))
+
+(define_predicate "gcn_unspec_operand" ; An unspec whose number is UNSPEC_VECTOR.
+  (and (match_code "unspec")
+       (match_test "XINT (op, 1) == UNSPEC_VECTOR")))
+
+(define_predicate "gcn_register_or_unspec_operand" ; Register or UNSPEC_VECTOR unspec.
+  (ior (match_operand 0 "register_operand")
+       ; Reuse gcn_unspec_operand rather than repeating its test here.
+       (match_operand 0 "gcn_unspec_operand")))
+
+(define_predicate "gcn_alu_or_unspec_operand" ; gcn_alu_operand or UNSPEC_VECTOR unspec.
+  (ior (match_operand 0 "gcn_alu_operand")
+       ; Reuse gcn_unspec_operand rather than repeating its test here.
+       (match_operand 0 "gcn_unspec_operand")))
+
+(define_predicate "gcn_register_ds_or_unspec_operand"
+  ; Register, gcn_ds_memory_operand, or UNSPEC_VECTOR unspec (shared test).
+  (ior (match_operand 0 "register_operand")
+       (ior (match_operand 0 "gcn_ds_memory_operand")
+	    (match_operand 0 "gcn_unspec_operand"))))