x86: Encode 256-bit/512-bit VEX/EVEX insns with 128-bit VEX

Message ID 20190317100804.22160-1-hjl.tools@gmail.com
State New
Headers show
Series
  • x86: Encode 256-bit/512-bit VEX/EVEX insns with 128-bit VEX
Related show

Commit Message

H.J. Lu March 17, 2019, 10:08 a.m.
Since all AVX512 processors support AVX, we can encode 256-bit/512-bit
VEX/EVEX vector register clearing instructions with 128-bit VEX vector
register clearing instructions at -O1.

	* config/tc-i386.c (optimize_encoding): Encode 256-bit/512-bit
	VEX/EVEX vector register clearing instructions with 128-bit VEX
	vector register clearing instructions at -O1.
	* doc/c-i386.texi: Update -O1 and -O2 documentation.
	* testsuite/gas/i386/i386.exp: Run optimize-1a and
	x86-64-optimize-2a.
	* testsuite/gas/i386/optimize-1a.d: New file.
	* testsuite/gas/i386/x86-64-optimize-2a.d: Likewise.
---
 gas/config/tc-i386.c                        |  22 ++--
 gas/doc/c-i386.texi                         |  10 +-
 gas/testsuite/gas/i386/i386.exp             |   2 +
 gas/testsuite/gas/i386/optimize-1a.d        |  66 ++++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-2a.d | 110 ++++++++++++++++++++
 5 files changed, 195 insertions(+), 15 deletions(-)
 create mode 100644 gas/testsuite/gas/i386/optimize-1a.d
 create mode 100644 gas/testsuite/gas/i386/x86-64-optimize-2a.d

-- 
2.20.1

Comments

Jan Beulich March 18, 2019, 11:42 a.m. | #1
>>> On 17.03.19 at 11:08, <hjl.tools@gmail.com> wrote:

> --- a/gas/config/tc-i386.c

> +++ b/gas/config/tc-i386.c

> @@ -3977,8 +3977,7 @@ optimize_encoding (void)

>  	    }

>  	}

>      }

> -  else if (optimize > 1

> -	   && i.reg_operands == 3

> +  else if (i.reg_operands == 3

>  	   && i.op[0].regs == i.op[1].regs

>  	   && !i.types[2].bitfield.xmmword

>  	   && (i.tm.opcode_modifier.vex

> @@ -4009,15 +4008,15 @@ optimize_encoding (void)

>  		|| i.tm.base_opcode == 0x6647)

>  	       && i.tm.extension_opcode == None))

>      {

> -      /* Optimize: -O2:

> +      /* Optimize: -O1:

>  	   VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,

>  	   vpsubq and vpsubw:

>  	     EVEX VOP %zmmM, %zmmM, %zmmN

>  	       -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)

> -	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> +	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>  	     EVEX VOP %ymmM, %ymmM, %ymmN

>  	       -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)

> -	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> +	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>  	     VEX VOP %ymmM, %ymmM, %ymmN

>  	       -> VEX VOP %xmmM, %xmmM, %xmmN

>  	   VOP, one of vpandn and vpxor:

> @@ -4026,17 +4025,17 @@ optimize_encoding (void)

>  	   VOP, one of vpandnd and vpandnq:

>  	     EVEX VOP %zmmM, %zmmM, %zmmN

>  	       -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)

> -	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> +	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>  	     EVEX VOP %ymmM, %ymmM, %ymmN

>  	       -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)

> -	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> +	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>  	   VOP, one of vpxord and vpxorq:

>  	     EVEX VOP %zmmM, %zmmM, %zmmN

>  	       -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)

> -	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> +	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>  	     EVEX VOP %ymmM, %ymmM, %ymmN

>  	       -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)

> -	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> +	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>  	   VOP, one of kxord and kxorq:

>  	     VEX VOP %kM, %kM, %kN

>  	       -> VEX kxorw %kM, %kM, %kN


I disagree - as per my earlier reply to another patch there shouldn't
be any use of cpu_arch_flags here, and -O2 should not all of a
sudden imply AVX512VL to be available when it wasn't explicitly
enabled. Effectively this will make it impossible to add any other,
ISA-independent optimization to -O2 later on.

Jan
H.J. Lu March 19, 2019, 2:53 a.m. | #2
On Mon, Mar 18, 2019 at 7:42 PM Jan Beulich <JBeulich@suse.com> wrote:
>

> >>> On 17.03.19 at 11:08, <hjl.tools@gmail.com> wrote:

> > --- a/gas/config/tc-i386.c

> > +++ b/gas/config/tc-i386.c

> > @@ -3977,8 +3977,7 @@ optimize_encoding (void)

> >           }

> >       }

> >      }

> > -  else if (optimize > 1

> > -        && i.reg_operands == 3

> > +  else if (i.reg_operands == 3

> >          && i.op[0].regs == i.op[1].regs

> >          && !i.types[2].bitfield.xmmword

> >          && (i.tm.opcode_modifier.vex

> > @@ -4009,15 +4008,15 @@ optimize_encoding (void)

> >               || i.tm.base_opcode == 0x6647)

> >              && i.tm.extension_opcode == None))

> >      {

> > -      /* Optimize: -O2:

> > +      /* Optimize: -O1:

> >          VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,

> >          vpsubq and vpsubw:

> >            EVEX VOP %zmmM, %zmmM, %zmmN

> >              -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)

> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

> >            EVEX VOP %ymmM, %ymmM, %ymmN

> >              -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)

> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

> >            VEX VOP %ymmM, %ymmM, %ymmN

> >              -> VEX VOP %xmmM, %xmmM, %xmmN

> >          VOP, one of vpandn and vpxor:

> > @@ -4026,17 +4025,17 @@ optimize_encoding (void)

> >          VOP, one of vpandnd and vpandnq:

> >            EVEX VOP %zmmM, %zmmM, %zmmN

> >              -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)

> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

> >            EVEX VOP %ymmM, %ymmM, %ymmN

> >              -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)

> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

> >          VOP, one of vpxord and vpxorq:

> >            EVEX VOP %zmmM, %zmmM, %zmmN

> >              -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)

> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

> >            EVEX VOP %ymmM, %ymmM, %ymmN

> >              -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)

> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

> >          VOP, one of kxord and kxorq:

> >            VEX VOP %kM, %kM, %kN

> >              -> VEX kxorw %kM, %kM, %kN

>

> I disagree - as per my earlier reply to another patch there shouldn't

> be any use of cpu_arch_flags here, and -O2 should not all of a

> sudden imply AVX512VL to be available when it wasn't explicitly

> enabled. Effectively this will make it impossible to add any other,

> ISA-independent optimization to -O2 later on.

>

>


Does

https://sourceware.org/ml/binutils/2019-03/msg00111.html

fix this problem?

-- 
H.J.
Jan Beulich March 19, 2019, 8:23 a.m. | #3
>>> On 19.03.19 at 03:53, <hjl.tools@gmail.com> wrote:

> On Mon, Mar 18, 2019 at 7:42 PM Jan Beulich <JBeulich@suse.com> wrote:

>>

>> >>> On 17.03.19 at 11:08, <hjl.tools@gmail.com> wrote:

>> > --- a/gas/config/tc-i386.c

>> > +++ b/gas/config/tc-i386.c

>> > @@ -3977,8 +3977,7 @@ optimize_encoding (void)

>> >           }

>> >       }

>> >      }

>> > -  else if (optimize > 1

>> > -        && i.reg_operands == 3

>> > +  else if (i.reg_operands == 3

>> >          && i.op[0].regs == i.op[1].regs

>> >          && !i.types[2].bitfield.xmmword

>> >          && (i.tm.opcode_modifier.vex

>> > @@ -4009,15 +4008,15 @@ optimize_encoding (void)

>> >               || i.tm.base_opcode == 0x6647)

>> >              && i.tm.extension_opcode == None))

>> >      {

>> > -      /* Optimize: -O2:

>> > +      /* Optimize: -O1:

>> >          VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,

>> >          vpsubq and vpsubw:

>> >            EVEX VOP %zmmM, %zmmM, %zmmN

>> >              -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)

>> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

>> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>> >            EVEX VOP %ymmM, %ymmM, %ymmN

>> >              -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)

>> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

>> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>> >            VEX VOP %ymmM, %ymmM, %ymmN

>> >              -> VEX VOP %xmmM, %xmmM, %xmmN

>> >          VOP, one of vpandn and vpxor:

>> > @@ -4026,17 +4025,17 @@ optimize_encoding (void)

>> >          VOP, one of vpandnd and vpandnq:

>> >            EVEX VOP %zmmM, %zmmM, %zmmN

>> >              -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)

>> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

>> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>> >            EVEX VOP %ymmM, %ymmM, %ymmN

>> >              -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)

>> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

>> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>> >          VOP, one of vpxord and vpxorq:

>> >            EVEX VOP %zmmM, %zmmM, %zmmN

>> >              -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)

>> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

>> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>> >            EVEX VOP %ymmM, %ymmM, %ymmN

>> >              -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)

>> > -            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)

>> > +            -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)

>> >          VOP, one of kxord and kxorq:

>> >            VEX VOP %kM, %kM, %kN

>> >              -> VEX kxorw %kM, %kM, %kN

>>

>> I disagree - as per my earlier reply to another patch there shouldn't

>> be any use of cpu_arch_flags here, and -O2 should not all of a

>> sudden imply AVX512VL to be available when it wasn't explicitly

>> enabled. Effectively this will make it impossible to add any other,

>> ISA-independent optimization to -O2 later on.

>>

>>

> 

> Does

> 

> https://sourceware.org/ml/binutils/2019-03/msg00111.html 

> 

> fix this problem?


I think so, yes, but I also think you've gone a little too far
with the code you remove:

>--- a/gas/config/tc-i386.c

>+++ b/gas/config/tc-i386.c

>@@ -3985,9 +3985,6 @@ optimize_encoding (void)

> 		   && !i.rounding

> 		   && is_evex_encoding (&i.tm)

> 		   && (i.vec_encoding != vex_encoding_evex

>-		       || cpu_arch_flags.bitfield.cpuavx

>-		       || cpu_arch_isa_flags.bitfield.cpuavx

>-		       || cpu_arch_flags.bitfield.cpuavx512vl

> 		       || cpu_arch_isa_flags.bitfield.cpuavx512vl

> 		       || i.tm.cpu_flags.bitfield.cpuavx512vl

> 		       || (i.tm.operand_types[2].bitfield.zmmword


I agree here.

>@@ -4045,17 +4042,13 @@ optimize_encoding (void)

>        */

>       if (is_evex_encoding (&i.tm))

> 	{

>-	  if (i.vec_encoding != vex_encoding_evex

>-	      && (cpu_arch_flags.bitfield.cpuavx

>-		  || cpu_arch_isa_flags.bitfield.cpuavx))

>+	  if (i.vec_encoding != vex_encoding_evex)

> 	    {

> 	      i.tm.opcode_modifier.vex = VEX128;

> 	      i.tm.opcode_modifier.vexw = VEXW0;

> 	      i.tm.opcode_modifier.evex = 0;

> 	    }

>-	  else if (optimize > 1

>-		   && (cpu_arch_flags.bitfield.cpuavx512vl

>-		       || cpu_arch_isa_flags.bitfield.cpuavx512vl))

>+	  else if (optimize > 1)

> 	    i.tm.opcode_modifier.evex = EVEX128;


But don't you need to retain the cpu_arch_isa_flags check here?

Btw., I see you're removing two of the cpuavx checks here that
I've mentioned in the mail sent a few minutes ago - thanks.
There's a 3rd one though:

  else if ((cpu_arch_flags.bitfield.cpuavx
	    || cpu_arch_isa_flags.bitfield.cpuavx)
	   && i.vec_encoding != vex_encoding_evex
	   && !i.types[0].bitfield.zmmword
	   && !i.mask
	   && is_evex_encoding (&i.tm)
	   && (i.tm.base_opcode == 0x666f
	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f

Jan

Patch

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index 8047ddf8b1..856c18d672 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -3977,8 +3977,7 @@  optimize_encoding (void)
 	    }
 	}
     }
-  else if (optimize > 1
-	   && i.reg_operands == 3
+  else if (i.reg_operands == 3
 	   && i.op[0].regs == i.op[1].regs
 	   && !i.types[2].bitfield.xmmword
 	   && (i.tm.opcode_modifier.vex
@@ -4009,15 +4008,15 @@  optimize_encoding (void)
 		|| i.tm.base_opcode == 0x6647)
 	       && i.tm.extension_opcode == None))
     {
-      /* Optimize: -O2:
+      /* Optimize: -O1:
 	   VOP, one of vandnps, vandnpd, vxorps, vxorpd, vpsubb, vpsubd,
 	   vpsubq and vpsubw:
 	     EVEX VOP %zmmM, %zmmM, %zmmN
 	       -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
 	     EVEX VOP %ymmM, %ymmM, %ymmN
 	       -> VEX VOP %xmmM, %xmmM, %xmmN (M and N < 16)
-	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
 	     VEX VOP %ymmM, %ymmM, %ymmN
 	       -> VEX VOP %xmmM, %xmmM, %xmmN
 	   VOP, one of vpandn and vpxor:
@@ -4026,17 +4025,17 @@  optimize_encoding (void)
 	   VOP, one of vpandnd and vpandnq:
 	     EVEX VOP %zmmM, %zmmM, %zmmN
 	       -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
 	     EVEX VOP %ymmM, %ymmM, %ymmN
 	       -> VEX vpandn %xmmM, %xmmM, %xmmN (M and N < 16)
-	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
 	   VOP, one of vpxord and vpxorq:
 	     EVEX VOP %zmmM, %zmmM, %zmmN
 	       -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
 	     EVEX VOP %ymmM, %ymmM, %ymmN
 	       -> VEX vpxor %xmmM, %xmmM, %xmmN (M and N < 16)
-	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16)
+	       -> EVEX VOP %xmmM, %xmmM, %xmmN (M || N >= 16) (-O2)
 	   VOP, one of kxord and kxorq:
 	     VEX VOP %kM, %kM, %kN
 	       -> VEX kxorw %kM, %kM, %kN
@@ -4054,8 +4053,9 @@  optimize_encoding (void)
 	      i.tm.opcode_modifier.vexw = VEXW0;
 	      i.tm.opcode_modifier.evex = 0;
 	    }
-	  else if (cpu_arch_flags.bitfield.cpuavx512vl
-		   || cpu_arch_isa_flags.bitfield.cpuavx512vl)
+	  else if (optimize > 1
+		   && (cpu_arch_flags.bitfield.cpuavx512vl
+		       || cpu_arch_isa_flags.bitfield.cpuavx512vl))
 	    i.tm.opcode_modifier.evex = EVEX128;
 	  else
 	    return;
diff --git a/gas/doc/c-i386.texi b/gas/doc/c-i386.texi
index 6c63560dbc..7e5f5c257e 100644
--- a/gas/doc/c-i386.texi
+++ b/gas/doc/c-i386.texi
@@ -453,10 +453,12 @@  Intel64 ISA in 64-bit mode.  The default is to accept both.
 Optimize instruction encoding with smaller instruction size.  @samp{-O}
 and @samp{-O1} encode 64-bit register load instructions with 64-bit
 immediate as 32-bit register load instructions with 31-bit or 32-bits
-immediates and encode 64-bit register clearing instructions with 32-bit
-register clearing instructions.  @samp{-O2} includes @samp{-O1}
-optimization plus encodes 256-bit and 512-bit vector register clearing
-instructions with 128-bit vector register clearing instructions.
+immediates, encode 64-bit register clearing instructions with 32-bit
+register clearing instructions and encode 256-bit/512-bit VEX/EVEX
+vector register clearing instructions with 128-bit VEX vector register
+clearing instructions.  @samp{-O2} includes @samp{-O1} optimization plus
+encodes 256-bit/512-bit EVEX vector register clearing instructions with
+128-bit EVEX vector register clearing instructions.
 @samp{-Os} includes @samp{-O2} optimization plus encodes 16-bit, 32-bit
 and 64-bit register tests with immediate as 8-bit register test with
 immediate.  @samp{-O0} turns off this optimization.
diff --git a/gas/testsuite/gas/i386/i386.exp b/gas/testsuite/gas/i386/i386.exp
index 61a13eaf9e..798bfb564a 100644
--- a/gas/testsuite/gas/i386/i386.exp
+++ b/gas/testsuite/gas/i386/i386.exp
@@ -468,6 +468,7 @@  if [expr ([istarget "i*86-*-*"] ||  [istarget "x86_64-*-*"]) && [gas_32_check]]
     run_dump_test "nop-1"
     run_dump_test "nop-2"
     run_dump_test "optimize-1"
+    run_dump_test "optimize-1a"
     run_dump_test "optimize-2"
     run_dump_test "optimize-3"
     run_dump_test "optimize-4"
@@ -981,6 +982,7 @@  if [expr ([istarget "i*86-*-*"] || [istarget "x86_64-*-*"]) && [gas_64_check]] t
     run_dump_test "x86-64-nop-2"
     run_dump_test "x86-64-optimize-1"
     run_dump_test "x86-64-optimize-2"
+    run_dump_test "x86-64-optimize-2a"
     run_dump_test "x86-64-optimize-3"
     run_dump_test "x86-64-optimize-4"
     run_dump_test "x86-64-optimize-5"
diff --git a/gas/testsuite/gas/i386/optimize-1a.d b/gas/testsuite/gas/i386/optimize-1a.d
new file mode 100644
index 0000000000..e6e6d81fe4
--- /dev/null
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -0,0 +1,66 @@ 
+#source: optimize-1.s
+#as: -O
+#objdump: -drw
+#name: optimized encoding 1a with -O
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+ +[a-f0-9]+:	62 f1 f5 4f 55 e9    	vandnpd %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 55 e9          	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 55 e9          	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 55 e9          	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 74 4f 55 e9    	vandnps %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f0 55 e9          	vandnps %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f0 55 e9          	vandnps %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f0 55 e9          	vandnps %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 75 4f df e9    	vpandnd %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 f5 4f df e9    	vpandnq %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 df e9          	vpandn %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 f5 4f 57 e9    	vxorpd %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 57 e9          	vxorpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 57 e9          	vxorpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 57 e9          	vxorpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 74 4f 57 e9    	vxorps %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f0 57 e9          	vxorps %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f0 57 e9          	vxorps %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f0 57 e9          	vxorps %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 75 4f ef e9    	vpxord %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 f5 4f ef e9    	vpxorq %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 ef e9          	vpxor  %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 75 4f f8 e9    	vpsubb %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 f8 e9          	vpsubb %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 f8 e9          	vpsubb %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 f8 e9          	vpsubb %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 75 4f f9 e9    	vpsubw %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 f9 e9          	vpsubw %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 f9 e9          	vpsubw %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 f9 e9          	vpsubw %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 75 4f fa e9    	vpsubd %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 fa e9          	vpsubd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 fa e9          	vpsubd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 fa e9          	vpsubd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 f5 4f fb e9    	vpsubq %zmm1,%zmm1,%zmm5\{%k7\}
+ +[a-f0-9]+:	c5 f1 fb e9          	vpsubq %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 fb e9          	vpsubq %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f1 fb e9          	vpsubq %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+#pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2a.d b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
new file mode 100644
index 0000000000..9c6466d4ae
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
@@ -0,0 +1,110 @@ 
+#source: x86-64-optimize-2.s
+#as: -O
+#objdump: -drw
+#name: x86-64 optimized encoding 2a with -O
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+ +[a-f0-9]+:	62 71 f5 4f 55 f9    	vandnpd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 55 f9          	vandnpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 55 f9          	vandnpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 55 f9          	vandnpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 55 c1    	vandnpd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 55 c1    	vandnpd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 55 c9    	vandnpd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 55 c9    	vandnpd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 74 4f 55 f9    	vandnps %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 70 55 f9          	vandnps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 55 f9          	vandnps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 55 f9          	vandnps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 74 48 55 c1    	vandnps %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 74 28 55 c1    	vandnps %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 74 40 55 c9    	vandnps %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 74 20 55 c9    	vandnps %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 71 75 4f df f9    	vpandnd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 df c1    	vpandnd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 df c1    	vpandnd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 df c9    	vpandnd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 df c9    	vpandnd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f df f9    	vpandnq %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 df c1    	vpandnq %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 df c1    	vpandnq %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 df c9    	vpandnq %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 df c9    	vpandnq %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f 57 f9    	vxorpd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 57 f9          	vxorpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 57 f9          	vxorpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 57 f9          	vxorpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 57 c1    	vxorpd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 57 c1    	vxorpd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 57 c9    	vxorpd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 57 c9    	vxorpd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 74 4f 57 f9    	vxorps %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 70 57 f9          	vxorps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 57 f9          	vxorps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 57 f9          	vxorps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 74 48 57 c1    	vxorps %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 74 28 57 c1    	vxorps %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 74 40 57 c9    	vxorps %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 74 20 57 c9    	vxorps %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 71 75 4f ef f9    	vpxord %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 ef c1    	vpxord %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 ef c1    	vpxord %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 ef c9    	vpxord %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 ef c9    	vpxord %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f ef f9    	vpxorq %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 ef c1    	vpxorq %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 ef c1    	vpxorq %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 ef c9    	vpxorq %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 ef c9    	vpxorq %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 75 4f f8 f9    	vpsubb %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 f8 f9          	vpsubb %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f8 f9          	vpsubb %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f8 f9          	vpsubb %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 f8 c1    	vpsubb %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 f8 c1    	vpsubb %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 f8 c9    	vpsubb %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 f8 c9    	vpsubb %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 75 4f f9 f9    	vpsubw %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 f9 f9          	vpsubw %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f9 f9          	vpsubw %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f9 f9          	vpsubw %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 f9 c1    	vpsubw %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 f9 c1    	vpsubw %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 f9 c9    	vpsubw %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 f9 c9    	vpsubw %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 75 4f fa f9    	vpsubd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 fa f9          	vpsubd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fa f9          	vpsubd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fa f9          	vpsubd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 fa c1    	vpsubd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 fa c1    	vpsubd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 fa c9    	vpsubd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 fa c9    	vpsubd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f fb f9    	vpsubq %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 fb f9          	vpsubq %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fb f9          	vpsubq %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fb f9          	vpsubq %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 fb c1    	vpsubq %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 fb c1    	vpsubq %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 fb c9    	vpsubq %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 fb c9    	vpsubq %ymm17,%ymm17,%ymm1
+#pass