[v2,rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit

Message ID 50b726a8-5857-3cd1-0d3b-a08e0e13fdf9@us.ibm.com
State New
Headers show
Series
  • [v2,rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit
Related show

Commit Message

Paul Clarke Feb. 19, 2019, 9:03 p.m.
Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64
(big-endian).

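_mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from
vector doubleword type to vector word type leaves the results in the even
lanes in big-endian mode, so those lanes must be selected with vec_mergee
there instead of the vec_mergeo used for little endian.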

Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc
(32-bit big-endian).

Using an incorrect type to interpret the result of the mfvsrd instruction
leads to incorrect results.  Also, the mfvsrd instruction only works as
expected in 64-bit mode, or for 32-bit quantities in 32-bit mode.  A more
general, if slower, solution is needed for 32-bit mode.

2019-02-19  Paul A. Clarke  <pc@us.ibm.com>

[gcc]

	* config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian.
	(_mm_cvtpd_ps): Likewise.
	(_mm_cvttpd_epi32): Likewise.

	PR target/89338
	* config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix type mismatch.
	(_mm_cvtss_si64): Fix type mismatch and 32-bit.

	PR target/89339
	* config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit.

---
v2: more elegant solution for the 32-bit mode fix in _mm_movemask_pi8,
    as suggested by Segher.

Comments

Paul Clarke Feb. 25, 2019, 6:16 p.m. | #1
ping.

On 02/19/2019 03:03 PM, Paul Clarke wrote:
> Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64
> (big-endian).
> 
> _mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from
> vector doubleword type to vector word type leaves the results in even
> lanes in big endian mode.
> 
> Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc
> (32-bit big-endian).
> 
> Incorrect type for interpreting the result from mfvsrd instruction leads
> to incorrect results.  Also, mfvsrd instruction only works as expected in
> 64-bit mode or for 32-bit quantities in 32-bit mode.  A more general,
> if slower, solution is needed for 32-bit mode.

> 

> 2019-02-19  Paul A. Clarke  <pc@us.ibm.com>
> 
> [gcc]
> 
> 	* config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian.
> 	(_mm_cvtpd_ps): Likewise.
> 	(_mm_cvttpd_epi32): Likewise.
> 
> 	PR89338
> 	* config/rs6000/xmmintrin.h (_mm_cvtss_f32):  Fix type mismatch.
> 	(_mm_cvt_ss2si): Fix type mismatch and 32-bit.
> 
> 	PR89339
> 	* config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit.
> 
> ---
> v2: more elegant solution for the 32-bit mode fix in _mm_movemask_pi8,
>     as suggested by Segher.

> 

> Index: gcc/config/rs6000/emmintrin.h

> ===================================================================

> diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h

> --- a/trunk/gcc/config/rs6000/emmintrin.h	(revision 268997)

> +++ b/trunk/gcc/config/rs6000/emmintrin.h	(working copy)

> @@ -887,7 +887,11 @@ _mm_cvtpd_epi32 (__m128d __A)

>        : );

>  

>  #ifdef _ARCH_PWR8

> +#ifdef __LITTLE_ENDIAN__

>    temp = vec_mergeo (temp, temp);

> +#else

> +  temp = vec_mergee (temp, temp);

> +#endif

>    result = (__v4si) vec_vpkudum ((__vector long long) temp,

>  				 (__vector long long) vzero);

>  #else

> @@ -922,7 +926,11 @@ _mm_cvtpd_ps (__m128d __A)

>        : );

>  

>  #ifdef _ARCH_PWR8

> +#ifdef __LITTLE_ENDIAN__

>    temp = vec_mergeo (temp, temp);

> +#else

> +  temp = vec_mergee (temp, temp);

> +#endif

>    result = (__v4sf) vec_vpkudum ((__vector long long) temp,

>  				 (__vector long long) vzero);

>  #else

> @@ -951,7 +959,11 @@ _mm_cvttpd_epi32 (__m128d __A)

>        : );

>  

>  #ifdef _ARCH_PWR8

> +#ifdef __LITTLE_ENDIAN__

>    temp = vec_mergeo (temp, temp);

> +#else

> +  temp = vec_mergee (temp, temp);

> +#endif

>    result = (__v4si) vec_vpkudum ((__vector long long) temp,

>  				 (__vector long long) vzero);

>  #else

> Index: gcc/config/rs6000/xmmintrin.h

> ===================================================================

> diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h

> --- a/trunk/gcc/config/rs6000/xmmintrin.h	(revision 268997)

> +++ b/trunk/gcc/config/rs6000/xmmintrin.h	(working copy)

> @@ -905,7 +905,7 @@ _mm_cvtss_f32 (__m128 __A)

>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))

>  _mm_cvtss_si32 (__m128 __A)

>  {

> -  __m64 res = 0;

> +  int res;

>  #ifdef _ARCH_PWR8

>    double dtmp;

>    __asm__(

> @@ -938,8 +938,8 @@ _mm_cvt_ss2si (__m128 __A)

>  extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))

>  _mm_cvtss_si64 (__m128 __A)

>  {

> -  __m64 res = 0;

> -#ifdef _ARCH_PWR8

> +  long long res;

> +#if defined (_ARCH_PWR8) && defined (__powerpc64__)

>    double dtmp;

>    __asm__(

>  #ifdef __LITTLE_ENDIAN__

> @@ -1577,6 +1577,7 @@ _m_pminub (__m64 __A, __m64 __B)

>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))

>  _mm_movemask_pi8 (__m64 __A)

>  {

> +#ifdef __powerpc64__

>    unsigned long long p =

>  #ifdef __LITTLE_ENDIAN__

>                           0x0008101820283038UL; // permute control for sign bits

> @@ -1584,6 +1585,12 @@ _mm_movemask_pi8 (__m64 __A)

>                           0x3830282018100800UL; // permute control for sign bits

>  #endif

>    return __builtin_bpermd (p, __A);

> +#else

> +  unsigned int mask = 0x20283038UL;

> +  unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;

> +  unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;

> +  return (r2 << 4) | r1;

> +#endif

>  }

>  

>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))

>
Segher Boessenkool Feb. 25, 2019, 7:18 p.m. | #2
Hi Paul,

On Tue, Feb 19, 2019 at 03:03:58PM -0600, Paul Clarke wrote:
> Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64
> (big-endian).
> 
> _mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from
> vector doubleword type to vector word type leaves the results in even
> lanes in big endian mode.
> 
> Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc
> (32-bit big-endian).
> 
> Incorrect type for interpreting the result from mfvsrd instruction leads
> to incorrect results.  Also, mfvsrd instruction only works as expected in
> 64-bit mode or for 32-bit quantities in 32-bit mode.  A more general,
> if slower, solution is needed for 32-bit mode.


Sorry for not reviewing this before.  Thanks for the ping :-)

> 2019-02-19  Paul A. Clarke  <pc@us.ibm.com>
> 
> [gcc]
> 
> 	* config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian.
> 	(_mm_cvtpd_ps): Likewise.
> 	(_mm_cvttpd_epi32): Likewise.
> 
> 	PR89338


This should be

	PR target/89338

> 	* config/rs6000/xmmintrin.h (_mm_cvtss_f32):  Fix type mismatch.
> 	(_mm_cvt_ss2si): Fix type mismatch and 32-bit.
> 
> 	PR89339
> 	* config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit.


Okay for trunk with those corrected.  Thanks!


Segher

Patch

Index: gcc/config/rs6000/emmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h
--- a/trunk/gcc/config/rs6000/emmintrin.h	(revision 268997)
+++ b/trunk/gcc/config/rs6000/emmintrin.h	(working copy)
@@ -887,7 +887,11 @@  _mm_cvtpd_epi32 (__m128d __A)
       : );
 
 #ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
   temp = vec_mergeo (temp, temp);
+#else
+  temp = vec_mergee (temp, temp);
+#endif
   result = (__v4si) vec_vpkudum ((__vector long long) temp,
 				 (__vector long long) vzero);
 #else
@@ -922,7 +926,11 @@  _mm_cvtpd_ps (__m128d __A)
       : );
 
 #ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
   temp = vec_mergeo (temp, temp);
+#else
+  temp = vec_mergee (temp, temp);
+#endif
   result = (__v4sf) vec_vpkudum ((__vector long long) temp,
 				 (__vector long long) vzero);
 #else
@@ -951,7 +959,11 @@  _mm_cvttpd_epi32 (__m128d __A)
       : );
 
 #ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
   temp = vec_mergeo (temp, temp);
+#else
+  temp = vec_mergee (temp, temp);
+#endif
   result = (__v4si) vec_vpkudum ((__vector long long) temp,
 				 (__vector long long) vzero);
 #else
Index: gcc/config/rs6000/xmmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h
--- a/trunk/gcc/config/rs6000/xmmintrin.h	(revision 268997)
+++ b/trunk/gcc/config/rs6000/xmmintrin.h	(working copy)
@@ -905,7 +905,7 @@  _mm_cvtss_f32 (__m128 __A)
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_si32 (__m128 __A)
 {
-  __m64 res = 0;
+  int res;
 #ifdef _ARCH_PWR8
   double dtmp;
   __asm__(
@@ -938,8 +938,8 @@  _mm_cvt_ss2si (__m128 __A)
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_si64 (__m128 __A)
 {
-  __m64 res = 0;
-#ifdef _ARCH_PWR8
+  long long res;
+#if defined (_ARCH_PWR8) && defined (__powerpc64__)
   double dtmp;
   __asm__(
 #ifdef __LITTLE_ENDIAN__
@@ -1577,6 +1577,7 @@  _m_pminub (__m64 __A, __m64 __B)
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pi8 (__m64 __A)
 {
+#ifdef __powerpc64__
   unsigned long long p =
 #ifdef __LITTLE_ENDIAN__
                          0x0008101820283038UL; // permute control for sign bits
@@ -1584,6 +1585,12 @@  _mm_movemask_pi8 (__m64 __A)
                          0x3830282018100800UL; // permute control for sign bits
 #endif
   return __builtin_bpermd (p, __A);
+#else
+  unsigned int mask = 0x20283038UL;
+  unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
+  unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
+  return (r2 << 4) | r1;
+#endif
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))