[21/40] i386: Emulate MMX maskmovq with SSE2 maskmovdqu

Message ID 20190211225553.32050-22-hjl.tools@gmail.com
State Superseded
Series
  • V4: Emulate MMX intrinsics with SSE

Commit Message

H.J. Lu Feb. 11, 2019, 10:55 p.m.
Emulate MMX maskmovq with SSE2 maskmovdqu in 64-bit mode by zero-extending
the source and mask operands to 128 bits.  Handle potentially unmapped bits
64:127 at the memory address by adjusting the source and mask operands
together with the memory address.

	PR target/89021
	* config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
	maskmovdqu in 64-bit mode.
---
 gcc/config/i386/xmmintrin.h | 61 +++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

-- 
2.20.1
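
For illustration, here is a minimal sketch of the zero-extension step described in the commit message (maskmovq_via_sse2 is a hypothetical helper name, not part of the patch; the sketch relies on the SSE2 intrinsics _mm_movpi64_epi64 and _mm_maskmoveu_si128 and deliberately ignores the page-boundary handling that the patch adds):

#include <emmintrin.h>

/* Hypothetical helper, not from the patch: widen the 64-bit data and
   mask to 128 bits with the upper halves zeroed, then do the store
   with SSE2 maskmovdqu.  */
static inline void
maskmovq_via_sse2 (__m64 data, __m64 mask, char *p)
{
  __m128i d = _mm_movpi64_epi64 (data);   /* bits 64:127 are zero */
  __m128i m = _mm_movpi64_epi64 (mask);   /* bits 64:127 are zero */
  _mm_maskmoveu_si128 (d, m, p);          /* 16-byte masked store */
}

The catch, and the reason the patch below is more involved, is that maskmovdqu always operates on a 16-byte window: when the destination sits near the end of a page, the bytes corresponding to bits 64:127 at __P may be unmapped, which is what the offset adjustment in the patch addresses.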

Comments

Uros Bizjak Feb. 12, 2019, 10:50 a.m. | #1
On Mon, Feb 11, 2019 at 11:55 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Emulate MMX maskmovq with SSE2 maskmovdqu in 64-bit mode by zero-extending
> the source and mask operands to 128 bits.  Handle potentially unmapped bits
> 64:127 at the memory address by adjusting the source and mask operands
> together with the memory address.
>
>         PR target/89021
>         * config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
>         maskmovdqu in 64-bit mode.
> ---
>  gcc/config/i386/xmmintrin.h | 61 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
>
> diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
> index 58284378514..e797795f127 100644
> --- a/gcc/config/i386/xmmintrin.h
> +++ b/gcc/config/i386/xmmintrin.h
> @@ -1165,7 +1165,68 @@ _m_pshufw (__m64 __A, int const __N)
>  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
>  {
> +#ifdef __x86_64__

We need __MMX_WITH_SSE__ target macro defined from the compiler here.

Uros.
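
In concrete terms, the request amounts to roughly the following sketch (__MMX_WITH_SSE__ is the macro named in this review; how the compiler would define it is not shown here):

#ifdef __MMX_WITH_SSE__
  /* The SSE2 maskmovdqu-based emulation from the patch, unchanged;
     only the guard differs from the posted version.  */
#else
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
#endif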


Patch

diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 58284378514..e797795f127 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1165,7 +1165,68 @@  _m_pshufw (__m64 __A, int const __N)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
 {
+#ifdef __x86_64__
+  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+     64:127 at address __P.  */
+  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+  /* Zero-extend __A and __N to 128 bits.  */
+  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+
+  /* Check the alignment of __P.  */
+  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+  if (offset)
+    {
+      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
+	 Otherwise, subtract __P by the misalignment.  */
+      if (offset > 8)
+	offset = 8;
+      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+      /* Shift __A128 and __N128 to the left by the adjustment.  */
+      switch (offset)
+	{
+	case 1:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
+	  break;
+	case 2:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
+	  break;
+	case 3:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
+	  break;
+	case 4:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
+	  break;
+	case 5:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
+	  break;
+	case 6:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
+	  break;
+	case 7:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
+	  break;
+	case 8:
+	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
+	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
+	  break;
+	default:
+	  break;
+	}
+    }
+  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+#else
   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
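
Two remarks on the posted patch, offered as illustration rather than as part of the submission.  First, the switch over offset presumably exists because __builtin_ia32_pslldqi128 takes its shift count (in bits) as a compile-time constant, so the byte shift cannot simply be written as offset * 8.  Second, the situation the offset adjustment is meant to handle can be sketched with a small standalone program (a hypothetical test, assuming a Linux-style mmap with MAP_ANONYMOUS; it is not from the patch or the GCC testsuite) that places the 8-byte destination at the very end of a mapped page, so the bytes corresponding to bits 64:127 at that address fall on an unmapped page:

#include <xmmintrin.h>
#include <sys/mman.h>
#include <unistd.h>

int
main (void)
{
  long psize = sysconf (_SC_PAGESIZE);
  /* Map a single page; the page after it stays unmapped.  */
  char *page = mmap (NULL, psize, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (page == MAP_FAILED)
    return 1;
  /* The last 8 bytes of the page: a plain 16-byte maskmovdqu at this
     address would reach 8 bytes into the unmapped page.  */
  char *dst = page + psize - 8;
  __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  __m64 mask = _mm_set_pi8 (-1, 0, -1, 0, -1, 0, -1, 0);  /* store every other byte */
  _mm_maskmove_si64 (data, mask, dst);
  _mm_empty ();
  munmap (page, psize);
  return 0;
}

With the emulation in the patch, offset is 8 here (the destination is 8 bytes below a 16-byte boundary), so the store is issued at dst - 8 with both operands shifted left by 8 bytes, and the 16-byte access stays within the mapped page.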