[24/46] i386: Emulate MMX maskmovq with SSE2 maskmovdqu

Message ID 20190201211809.963-25-hjl.tools@gmail.com
State New
Series
  • Implement MMX intrinsics with SSE

Commit Message

H.J. Lu Feb. 1, 2019, 9:17 p.m.
Emulate MMX maskmovq with SSE2 maskmovdqu by zeroing out the upper 64
bits of the mask operand.  A warning is issued since an invalid memory
access may happen when bits 64:127 at the memory location are unmapped:

xmmintrin.h:1168:3: note: Emulate MMX maskmovq with SSE2 maskmovdqu may result in invalid memory access
 1168 |   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Only SSE register source operand is allowed.
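
To make the scenario behind the warning concrete, here is a minimal,
hypothetical test case (not part of the patch): the eight destination bytes
sit at the very end of a mapped page, so the extra bytes 64:127 addressed by
an emulated maskmovdqu fall into the unmapped page that follows.  Whether the
masked-off bytes are actually accessed is implementation dependent, which is
why the note says "may result in invalid memory access".

#include <sys/mman.h>
#include <xmmintrin.h>

int
main (void)
{
  /* Map two pages and drop the second one so an unmapped page follows
     the destination.  A page size of 4096 is assumed.  */
  const long pagesize = 4096;
  char *p = mmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
		  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED)
    return 1;
  munmap (p + pagesize, pagesize);

  char *dst = p + pagesize - 8;		/* Last 8 mapped bytes.  */
  __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  __m64 mask = _mm_set_pi8 (-1, -1, -1, -1, -1, -1, -1, -1);

  /* MMX maskmovq writes only dst[0..7].  An emulated maskmovdqu
     addresses dst[0..15], so it may touch the unmapped page even
     though the mask for those bytes is zero.  */
  _mm_maskmove_si64 (data, mask, dst);
  return 0;
}

The updated patch later in the thread avoids this by adjusting the store
address together with the source and mask operands.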

	PR target/89021
	* config/i386/mmx.md (mmx_maskmovq): Emulate MMX maskmovq with
	SSE2 maskmovdqu and a warning.
	(sse2_maskmovq_<mode>): New.
	(*mmx_maskmovq): Add "&& !TARGET_MMX_WITH_SSE".
	* config/i386/sse.md (*sse2_maskmovdqu): Renamed to ...
	(sse2_maskmovdqu_<mode>): This.
---
 gcc/config/i386/mmx.md | 59 ++++++++++++++++++++++++++++++++++++++++--
 gcc/config/i386/sse.md |  2 +-
 2 files changed, 58 insertions(+), 3 deletions(-)

-- 
2.20.1

Comments

H.J. Lu Feb. 3, 2019, 4:07 p.m. | #1
On Fri, Feb 01, 2019 at 01:17:47PM -0800, H.J. Lu wrote:
> Emulate MMX maskmovq with SSE2 maskmovdqu by zeroing out the upper 64
> bits of the mask operand.  A warning is issued since invalid memory
> access may happen when bits 64:127 at memory location are unmapped:
>
> xmmintrin.h:1168:3: note: Emulate MMX maskmovq with SSE2 maskmovdqu may result in invalid memory access
>  1168 |   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
>       |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> Only SSE register source operand is allowed.

Here is an updated patch that handles unmapped bits 64:127 at the memory
address by adjusting the source and mask operands together with the
memory address.
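
As a sanity check on that adjustment, here is a small standalone sketch
(an illustration, not part of the patch) that replays the pointer arithmetic
for one concrete address near a page boundary and verifies that the widened
16-byte maskmovdqu access stays inside pages the original 8-byte maskmovq
store already touches.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* Hypothetical destination: 9 bytes into a 16-byte chunk, so the
     8-byte store straddles a 4096-byte page boundary.  */
  uintptr_t page = 0x7f0000000000;
  uintptr_t p = page + 4096 - 7;		/* p & 0xf == 9 */

  /* Same adjustment as in the patch: clamp the misalignment to 8 and
     move the store address down by that many bytes; the data and mask
     are shifted left by the same amount.  */
  uintptr_t offset = p & 0xf;
  if (offset > 8)
    offset = 8;
  uintptr_t adjusted = p - offset;

  /* maskmovq touches [p, p + 8); the emulation touches
     [adjusted, adjusted + 16).  Check that the wider access stays in
     pages the original store already touches.  */
  uintptr_t first_page = p & ~(uintptr_t) 4095;
  uintptr_t last_page = (p + 7) & ~(uintptr_t) 4095;
  assert (adjusted >= first_page);
  assert (adjusted + 16 <= last_page + 4096);

  printf ("shift left by %u bytes, store 16 bytes at %#lx\n",
	  (unsigned) offset, (unsigned long) adjusted);
  return 0;
}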


H.J.
---
Emulate MMX maskmovq with SSE2 maskmovdqu by zero-extending the source and
mask operands to 128 bits.  Handle unmapped bits 64:127 at the memory
address by adjusting the source and mask operands together with the memory
address.

	PR target/89021
	* config/i386/i386.c (ix86_init_mmx_sse_builtins): Don't
	provide __builtin_ia32_maskmovq for TARGET_MMX_WITH_SSE.
	* config/i386/mmx.md (mmx_maskmovq): Add "&& !TARGET_MMX_WITH_SSE".
	(*mmx_maskmovq): Likewise.
	* config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
	maskmovdqu.
---
 gcc/config/i386/i386.c      | 15 ++++++++------
 gcc/config/i386/mmx.md      |  4 ++--
 gcc/config/i386/xmmintrin.h | 39 +++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5f4f7e9ddde..b7cbc3f8a2d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31048,12 +31048,15 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr",
 		    UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
 
-  /* SSE or 3DNow!A */
-  def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
-	       /* As it uses V4HImode, we have to require -mmmx too.  */
-	       | OPTION_MASK_ISA_MMX, 0,
-	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
-	       IX86_BUILTIN_MASKMOVQ);
+  /* SSE or 3DNow!A.  NB: We can't emulate MMX maskmovq directly with
+     SSE2 maskmovdqu since invalid memory access may happen when bits
+     64:127 at memory location are unmapped.  */
+  if (!TARGET_MMX_WITH_SSE)
+    def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
+		 /* As it uses V4HImode, we have to require -mmmx too.  */
+		 | OPTION_MASK_ISA_MMX, 0,
+		 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
+		 IX86_BUILTIN_MASKMOVQ);
 
   /* SSE2 */
   def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu",
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f90574a7255..a1b732ad7be 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1748,7 +1748,7 @@
 		      (match_operand:V8QI 2 "register_operand")
 		      (match_dup 0)]
 		     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A")
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE")
 
 (define_insn "*mmx_maskmovq"
   [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
@@ -1756,7 +1756,7 @@
 		      (match_operand:V8QI 2 "register_operand" "y")
 		      (mem:V8QI (match_dup 0))]
 		     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A"
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE"
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovq\t{%2, %1|%1, %2}"
   [(set_attr "type" "mmxcvt")
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 58284378514..680256f5fab 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1165,7 +1165,46 @@ _m_pshufw (__m64 __A, int const __N)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
 {
+#ifdef __x86_64__
+# ifdef __MMX__
+  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+# else
+  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+     64:127 at address __P.  */
+  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+  __v2di __A128, __N128;
+
+  /* Check the alignment of __P.  */
+  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+  if (offset)
+    {
+      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
+	 Otherwise, subtract __P by the misalignment.  */
+      if (offset > 8)
+	offset = 8;
+      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+      /* Zero-extend __A and __N to 128 bits and shift right by the
+	 adjustment.  */
+      unsigned __int128 __a128 = ((__v1di) __A)[0];
+      unsigned __int128 __n128 = ((__v1di) __N)[0];
+      __a128 <<= offset * 8;
+      __n128 <<= offset * 8;
+      __A128 = __extension__ (__v2di) { __a128, __a128 >> 64 };
+      __N128 = __extension__ (__v2di) { __n128, __n128 >> 64 };
+    }
+  else
+    {
+      /* Zero-extend __A and __N to 128 bits.  */
+      __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+      __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+    }
+  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+# endif
+#else
   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-- 
2.20.1
Jakub Jelinek Feb. 4, 2019, 11:19 a.m. | #2
On Sun, Feb 03, 2019 at 08:07:22AM -0800, H.J. Lu wrote:
> +      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
> +	 Otherwise, subtract __P by the misalignment.  */
> +      if (offset > 8)
> +	offset = 8;
> +      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
> +
> +      /* Zero-extend __A and __N to 128 bits and shift right by the
> +	 adjustment.  */
> +      unsigned __int128 __a128 = ((__v1di) __A)[0];
> +      unsigned __int128 __n128 = ((__v1di) __N)[0];
> +      __a128 <<= offset * 8;
> +      __n128 <<= offset * 8;
> +      __A128 = __extension__ (__v2di) { __a128, __a128 >> 64 };
> +      __N128 = __extension__ (__v2di) { __n128, __n128 >> 64 };

We have _mm_slli_si128/__builtin_ia32_pslldqi128, why can't you use that
instead of doing the arithmetics in unsigned __int128 scalars?

	Jakub
H.J. Lu Feb. 4, 2019, 12:45 p.m. | #3
On Mon, Feb 4, 2019 at 3:19 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Sun, Feb 03, 2019 at 08:07:22AM -0800, H.J. Lu wrote:
> > +      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
> > +      Otherwise, subtract __P by the misalignment.  */
> > +      if (offset > 8)
> > +     offset = 8;
> > +      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
> > +
> > +      /* Zero-extend __A and __N to 128 bits and shift right by the
> > +      adjustment.  */
> > +      unsigned __int128 __a128 = ((__v1di) __A)[0];
> > +      unsigned __int128 __n128 = ((__v1di) __N)[0];
> > +      __a128 <<= offset * 8;
> > +      __n128 <<= offset * 8;
> > +      __A128 = __extension__ (__v2di) { __a128, __a128 >> 64 };
> > +      __N128 = __extension__ (__v2di) { __n128, __n128 >> 64 };
> >
> > We have _mm_slli_si128/__builtin_ia32_pslldqi128, why can't you use that
> > instead of doing the arithmetics in unsigned __int128 scalars?
>


Since "PSLLDQ xmm1, imm8" takes an immediate operand, using
__builtin_ia32_pslldqi128 would require a switch statement, whereas the
__int128 arithmetic doesn't need one.
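
A minimal sketch of the constraint under discussion (an illustration, not
from the thread): with GCC, the byte count of _mm_slli_si128 and
__builtin_ia32_pslldqi128 must be a compile-time constant, so a run-time
offset has to be dispatched through a switch, while a shift of an
unsigned __int128 accepts a variable count directly.

#include <emmintrin.h>

__m128i
shift_left_bytes (__m128i v, unsigned offset)
{
  /* _mm_slli_si128 (v, offset) would not compile here: the byte count
     must be an immediate, so dispatch on the run-time value.  */
  switch (offset)
    {
    case 1: return _mm_slli_si128 (v, 1);
    case 2: return _mm_slli_si128 (v, 2);
    case 3: return _mm_slli_si128 (v, 3);
    case 4: return _mm_slli_si128 (v, 4);
    case 5: return _mm_slli_si128 (v, 5);
    case 6: return _mm_slli_si128 (v, 6);
    case 7: return _mm_slli_si128 (v, 7);
    case 8: return _mm_slli_si128 (v, 8);
    default: return v;
    }
}

unsigned __int128
shift_left_bytes_scalar (unsigned __int128 x, unsigned offset)
{
  /* offset is at most 8 here, as in the patch; a variable count is
     fine for a scalar shift.  */
  return x << (offset * 8);
}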

-- 
H.J.
H.J. Lu Feb. 4, 2019, 1:36 p.m. | #4
On Mon, Feb 04, 2019 at 04:45:24AM -0800, H.J. Lu wrote:
> On Mon, Feb 4, 2019 at 3:19 AM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > On Sun, Feb 03, 2019 at 08:07:22AM -0800, H.J. Lu wrote:
> > > +      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
> > > +      Otherwise, subtract __P by the misalignment.  */
> > > +      if (offset > 8)
> > > +     offset = 8;
> > > +      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
> > > +
> > > +      /* Zero-extend __A and __N to 128 bits and shift right by the
> > > +      adjustment.  */
> > > +      unsigned __int128 __a128 = ((__v1di) __A)[0];
> > > +      unsigned __int128 __n128 = ((__v1di) __N)[0];
> > > +      __a128 <<= offset * 8;
> > > +      __n128 <<= offset * 8;
> > > +      __A128 = __extension__ (__v2di) { __a128, __a128 >> 64 };
> > > +      __N128 = __extension__ (__v2di) { __n128, __n128 >> 64 };
> >
> > We have _mm_slli_si128/__builtin_ia32_pslldqi128, why can't you use that
> > instead of doing the arithmetics in unsigned __int128 scalars?
> >
>
> Since "PSLLDQ xmm1, imm8" takes an immediate operand, using
> __builtin_ia32_pslldqi128 would require a switch statement, whereas the
> __int128 arithmetic doesn't need one.

This updated patch uses __builtin_ia32_pslldqi128.


H.J.
---
Emulate MMX maskmovq with SSE2 maskmovdqu by zero-extending the source and
mask operands to 128 bits.  Handle unmapped bits 64:127 at the memory
address by adjusting the source and mask operands together with the memory
address.

	PR target/89021
	* config/i386/i386.c (ix86_init_mmx_sse_builtins): Don't
	provide __builtin_ia32_maskmovq for TARGET_MMX_WITH_SSE.
	* config/i386/mmx.md (mmx_maskmovq): Add "&& !TARGET_MMX_WITH_SSE".
	(*mmx_maskmovq): Likewise.
	* config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
	maskmovdqu.
---
 gcc/config/i386/i386.c      | 15 +++++----
 gcc/config/i386/mmx.md      |  4 +--
 gcc/config/i386/xmmintrin.h | 65 +++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5f4f7e9ddde..b7cbc3f8a2d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31048,12 +31048,15 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr",
 		    UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
 
-  /* SSE or 3DNow!A */
-  def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
-	       /* As it uses V4HImode, we have to require -mmmx too.  */
-	       | OPTION_MASK_ISA_MMX, 0,
-	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
-	       IX86_BUILTIN_MASKMOVQ);
+  /* SSE or 3DNow!A.  NB: We can't emulate MMX maskmovq directly with
+     SSE2 maskmovdqu since invalid memory access may happen when bits
+     64:127 at memory location are unmapped.  */
+  if (!TARGET_MMX_WITH_SSE)
+    def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
+		 /* As it uses V4HImode, we have to require -mmmx too.  */
+		 | OPTION_MASK_ISA_MMX, 0,
+		 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
+		 IX86_BUILTIN_MASKMOVQ);
 
   /* SSE2 */
   def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu",
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f90574a7255..a1b732ad7be 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1748,7 +1748,7 @@
 		      (match_operand:V8QI 2 "register_operand")
 		      (match_dup 0)]
 		     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A")
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE")
 
 (define_insn "*mmx_maskmovq"
   [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
@@ -1756,7 +1756,7 @@
 		      (match_operand:V8QI 2 "register_operand" "y")
 		      (mem:V8QI (match_dup 0))]
 		     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A"
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE"
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovq\t{%2, %1|%1, %2}"
   [(set_attr "type" "mmxcvt")
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 58284378514..95152f8b337 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1165,7 +1165,72 @@ _m_pshufw (__m64 __A, int const __N)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
 {
+#ifdef __x86_64__
+# ifdef __MMX__
+  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+# else
+  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+     64:127 at address __P.  */
+  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+  /* Zero-extend __A and __N to 128 bits.  */
+  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+
+  /* Check the alignment of __P.  */
+  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+  if (offset)
+    {
+      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
+	 Otherwise, subtract __P by the misalignment.  */
+      if (offset > 8)
+	offset = 8;
+      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+      /* Shift __A128 and __N128 to the left by the adjustment.  */
+      switch (offset)
+	{
+	case 1:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
+	  break;
+	case 2:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
+	  break;
+	case 3:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
+	  break;
+	case 4:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
+	  break;
+	case 5:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
+	  break;
+	case 6:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
+	  break;
+	case 7:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
+	  break;
+	case 8:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
+	  break;
+	default:
+	  break;
+	}
+    }
+  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+# endif
+#else
   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-- 
2.20.1
Jakub Jelinek Feb. 4, 2019, 1:59 p.m. | #5
On Mon, Feb 04, 2019 at 05:36:12AM -0800, H.J. Lu wrote:
> +      /* Shift __A128 and __N128 to the left by the adjustment.  */
> +      switch (offset)

Ah, no, sorry, that is a bad suggestion then.  On the other hand,
(zext (word_var)) << shift,
where zext is from "word" to double-word and shift is 1 to word bitsize - 1,
can be done as
(word_var << shift) | ((word_var >> (word_bitsize - shift)) << word_bitsize),
so you could avoid the int128 shifts anyway and just shift left and right
and construct v2di from that.
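
For reference, here is a quick self-contained check of that identity for
64-bit words (an illustration, not from the thread): the low half of the
widened shift is word_var << shift and the high half is
word_var >> (word_bitsize - shift), for shift between 1 and 63.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t w = 0x0123456789abcdefULL;
  for (unsigned shift = 1; shift < 64; shift++)
    {
      /* Reference result: zero-extend to 128 bits, then shift.  */
      unsigned __int128 wide = (unsigned __int128) w << shift;

      /* The two 64-bit halves built without any 128-bit shift.  */
      uint64_t low = w << shift;
      uint64_t high = w >> (64 - shift);

      assert ((uint64_t) wide == low);
      assert ((uint64_t) (wide >> 64) == high);
    }
  return 0;
}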

	Jakub
H.J. Lu Feb. 4, 2019, 2:22 p.m. | #6
On Mon, Feb 4, 2019 at 5:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Mon, Feb 04, 2019 at 05:36:12AM -0800, H.J. Lu wrote:
> > +      /* Shift __A128 and __N128 to the left by the adjustment.  */
> > +      switch (offset)
>
> Ah, no, sorry, that is a bad suggestion then.  On the other hand,


The generated code isn't too bad:

(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
   0x00000000004011b0 <+0>: mov    %rdx,%rax
   0x00000000004011b3 <+3>: movq   (%rdi),%xmm0
   0x00000000004011b7 <+7>: movq   (%rsi),%xmm1
   0x00000000004011bb <+11>: and    $0xf,%eax
   0x00000000004011be <+14>: je     0x4011d4 <test_maskmovq+36>
   0x00000000004011c0 <+16>: cmp    $0x8,%rax
   0x00000000004011c4 <+20>: jbe    0x4011e0 <test_maskmovq+48>
   0x00000000004011c6 <+22>: sub    $0x8,%rdx
   0x00000000004011ca <+26>: pslldq $0x8,%xmm0
   0x00000000004011cf <+31>: pslldq $0x8,%xmm1
   0x00000000004011d4 <+36>: mov    %rdx,%rdi
   0x00000000004011d7 <+39>: maskmovdqu %xmm1,%xmm0
   0x00000000004011db <+43>: retq
   0x00000000004011dc <+44>: nopl   0x0(%rax)
   0x00000000004011e0 <+48>: sub    %rax,%rdx
   0x00000000004011e3 <+51>: jmpq   *0x402008(,%rax,8)
   0x00000000004011ea <+58>: nopw   0x0(%rax,%rax,1)
   0x00000000004011f0 <+64>: pslldq $0x7,%xmm0
   0x00000000004011f5 <+69>: pslldq $0x7,%xmm1
   0x00000000004011fa <+74>: jmp    0x4011d4 <test_maskmovq+36>
   0x00000000004011fc <+76>: nopl   0x0(%rax)
   0x0000000000401200 <+80>: pslldq $0x2,%xmm0
   0x0000000000401205 <+85>: pslldq $0x2,%xmm1
   0x000000000040120a <+90>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040120c <+92>: nopl   0x0(%rax)
   0x0000000000401210 <+96>: pslldq $0x3,%xmm0
   0x0000000000401215 <+101>: pslldq $0x3,%xmm1
   0x000000000040121a <+106>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040121c <+108>: nopl   0x0(%rax)
   0x0000000000401220 <+112>: pslldq $0x4,%xmm0
   0x0000000000401225 <+117>: pslldq $0x4,%xmm1
   0x000000000040122a <+122>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040122c <+124>: nopl   0x0(%rax)
   0x0000000000401230 <+128>: pslldq $0x5,%xmm0
   0x0000000000401235 <+133>: pslldq $0x5,%xmm1
   0x000000000040123a <+138>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040123c <+140>: nopl   0x0(%rax)
   0x0000000000401240 <+144>: pslldq $0x6,%xmm0
   0x0000000000401245 <+149>: pslldq $0x6,%xmm1
   0x000000000040124a <+154>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040124c <+156>: pslldq $0x1,%xmm0
   0x0000000000401251 <+161>: pslldq $0x1,%xmm1
   0x0000000000401256 <+166>: jmpq   0x4011d4 <test_maskmovq+36>
End of assembler dump.

__int128 isn't much better:

(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
   0x00000000004011b0 <+0>: mov    %rdx,%rcx
   0x00000000004011b3 <+3>: mov    (%rdi),%rax
   0x00000000004011b6 <+6>: mov    (%rsi),%rdi
   0x00000000004011b9 <+9>: and    $0xf,%ecx
   0x00000000004011bc <+12>: je     0x401240 <test_maskmovq+144>
   0x00000000004011c2 <+18>: cmp    $0x8,%rcx
   0x00000000004011c6 <+22>: mov    $0x8,%esi
   0x00000000004011cb <+27>: mov    %rax,%r8
   0x00000000004011ce <+30>: push   %rbx
   0x00000000004011cf <+31>: cmova  %rsi,%rcx
   0x00000000004011d3 <+35>: sar    $0x3f,%rax
   0x00000000004011d7 <+39>: mov    %r8,%r10
   0x00000000004011da <+42>: mov    %rdi,%rbx
   0x00000000004011dd <+45>: mov    %rax,%r11
   0x00000000004011e0 <+48>: sar    $0x3f,%rdi
   0x00000000004011e4 <+52>: xor    %eax,%eax
   0x00000000004011e6 <+54>: sub    %rcx,%rdx
   0x00000000004011e9 <+57>: shl    $0x3,%ecx
   0x00000000004011ec <+60>: mov    %rdi,%rsi
   0x00000000004011ef <+63>: shl    %cl,%r10
   0x00000000004011f2 <+66>: shld   %cl,%r8,%r11
   0x00000000004011f6 <+70>: test   $0x40,%cl
   0x00000000004011f9 <+73>: cmovne %r10,%r11
   0x00000000004011fd <+77>: cmovne %rax,%r10
   0x0000000000401201 <+81>: shld   %cl,%rbx,%rsi
   0x0000000000401205 <+85>: xor    %edi,%edi
   0x0000000000401207 <+87>: shl    %cl,%rbx
   0x000000000040120a <+90>: test   $0x40,%cl
   0x000000000040120d <+93>: mov    %r11,-0x8(%rsp)
   0x0000000000401212 <+98>: cmovne %rbx,%rsi
   0x0000000000401216 <+102>: movq   %r10,%xmm0
   0x000000000040121b <+107>: cmovne %rdi,%rbx
   0x000000000040121f <+111>: mov    %rdx,%rdi
   0x0000000000401222 <+114>: movq   %rbx,%xmm1
   0x0000000000401227 <+119>: movhps -0x8(%rsp),%xmm0
   0x000000000040122c <+124>: mov    %rsi,-0x8(%rsp)
   0x0000000000401231 <+129>: movhps -0x8(%rsp),%xmm1
   0x0000000000401236 <+134>: maskmovdqu %xmm1,%xmm0
   0x000000000040123a <+138>: pop    %rbx
   0x000000000040123b <+139>: retq
   0x000000000040123c <+140>: nopl   0x0(%rax)
   0x0000000000401240 <+144>: movq   %rdi,%xmm1
   0x0000000000401245 <+149>: movq   %rax,%xmm0
   0x000000000040124a <+154>: mov    %rdx,%rdi
   0x000000000040124d <+157>: maskmovdqu %xmm1,%xmm0
   0x0000000000401251 <+161>: retq
End of assembler dump.

> (zext (word_var)) << shift
> where zext is from "word" to double-word and shift is 1 to word bitsize - 1
> can be done as (word_var << shift) | ((word_var >> (word_bitsize - shift)) << word_bitsize)
> so you could avoid the int128 shifts anyway and just shift left and right
> and construct v2di from that.
>

This requires two 64-bit variables for one 128-bit variable.  There isn't
much of a difference for x86-64.  I don't think we can emulate MMX with SSE
in 32-bit mode since __m64 is passed and returned in MMX registers.


--
H.J.

Patch

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f90574a7255..92252984482 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1748,7 +1748,62 @@ 
 		      (match_operand:V8QI 2 "register_operand")
 		      (match_dup 0)]
 		     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A")
+  "TARGET_SSE || TARGET_3DNOW_A"
+{
+  if (TARGET_MMX_WITH_SSE)
+    {
+      /* Emulate MMX maskmovq with SSE2 maskmovdqu and issue a warning
+	 since they aren't equivalent.  */
+      inform (input_location, "Emulate MMX maskmovq with SSE2 maskmovdqu "
+	      "may result in invalid memory access");
+      rtx insn;
+      rtx op = gen_reg_rtx (V2DImode);
+      if (Pmode == SImode)
+	insn = gen_sse2_maskmovq_si (XEXP (operands[0], 0),
+				     operands[1], operands[2], op, op);
+      else
+	insn = gen_sse2_maskmovq_di (XEXP (operands[0], 0),
+				     operands[1], operands[2], op, op);
+      emit_insn (insn);
+      DONE;
+    }
+})
+
+(define_insn_and_split "sse2_maskmovq_<mode>"
+  [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
+	(unspec:V8QI [(match_operand:V8QI 1 "register_operand" "Yy")
+		      (match_operand:V8QI 2 "register_operand" "Yy")
+		      (mem:V8QI (match_dup 0))]
+		     UNSPEC_MASKMOV))
+   (set (match_operand:V2DI 3 "register_operand" "=Yy")
+	(unspec:V2DI [(match_operand:V2DI 4 "register_operand" "3")]
+		     UNSPEC_MASKMOV))]
+  "TARGET_MMX_WITH_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  /* Copy the lower 64 bits of operand 2 (the mask operand) to operand 3.
+     NB: Invalid memory access may happen when bits 64:127 at memory
+     location are unmapped.  */
+  rtx op3 = operands[3];
+  rtx op2 = gen_rtx_REG (V2DImode, REGNO (operands[2]));
+  rtx insn = gen_sse2_movq128 (op3, op2);
+  emit_insn (insn);
+
+  /* Generate SSE2 maskmovdqu with operand 3.  */
+  rtx op1 = gen_rtx_REG (V16QImode, REGNO (operands[1]));
+  op3 = gen_rtx_REG (V16QImode, REGNO (operands[3]));
+  if (Pmode == SImode)
+    insn = gen_sse2_maskmovdqu_si (operands[0], op1, op3);
+  else
+    insn = gen_sse2_maskmovdqu_di (operands[0], op1, op3);
+  emit_insn (insn);
+  DONE;
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "TI")])
 
 (define_insn "*mmx_maskmovq"
   [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
@@ -1756,7 +1811,7 @@ 
 		      (match_operand:V8QI 2 "register_operand" "y")
 		      (mem:V8QI (match_dup 0))]
 		     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A"
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE"
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovq\t{%2, %1|%1, %2}"
   [(set_attr "type" "mmxcvt")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9ecd9789c1e..7218c9cd646 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15142,7 +15142,7 @@ 
 		      UNSPEC_MASKMOV))]
   "TARGET_SSE2")
 
-(define_insn "*sse2_maskmovdqu"
+(define_insn "sse2_maskmovdqu_<mode>"
   [(set (mem:V16QI (match_operand:P 0 "register_operand" "D"))
 	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x")
 		       (match_operand:V16QI 2 "register_operand" "x")