[x86] Match movss and movsd "blend" instructions

Message ID 6341719.0aBNjb40xM@twilight
State New
Headers show
Series
  • [x86] Match movss and movsd "blend" instructions
Related show

Commit Message

Allan Sandfeld Jensen Aug. 1, 2018, 4 p.m.
Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-07-29 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
---
 gcc/config/i386/emmintrin.h               |  2 +-
 gcc/config/i386/i386.c                    | 44 +++++++++++++++++++++++
 gcc/config/i386/xmmintrin.h               |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-movs.c | 21 +++++++++++
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-movs.c

Comments

Marc Glisse Aug. 1, 2018, 4:51 p.m. | #1
On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:

>  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 

> __artificial__))

>  _mm_move_sd (__m128d __A, __m128d __B)

>  {

> -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);

> +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};

>  }


If the goal is to have it represented as a VEC_PERM_EXPR internally, I 
wonder if we should be explicit and use __builtin_shuffle instead of 
relying on some forwprop pass to transform it. Maybe not, just asking. And 
the answer need not even be the same for _mm_move_sd and _mm_move_ss.

-- 
Marc Glisse
Allan Sandfeld Jensen Aug. 2, 2018, 9:12 a.m. | #2
On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:

> >  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,

> > 

> > __artificial__))

> > 

> >  _mm_move_sd (__m128d __A, __m128d __B)

> >  {

> > 

> > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);

> > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};

> > 

> >  }

> 

> If the goal is to have it represented as a VEC_PERM_EXPR internally, I

> wonder if we should be explicit and use __builtin_shuffle instead of

> relying on some forwprop pass to transform it. Maybe not, just asking. And

> the answer need not even be the same for _mm_move_sd and _mm_move_ss.


I wrote it this way because this pattern could later also be used for the 
other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could not. 
To match the other intrinsics the logic that tries to match vector 
construction just needs to be extended to try merge patterns even if one of 
the subexpressions is not simple.

'Allan
Richard Biener Aug. 2, 2018, 9:18 a.m. | #3
On Thu, Aug 2, 2018 at 11:12 AM Allan Sandfeld Jensen
<linux@carewolf.com> wrote:
>

> On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:

> > On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:

> > >  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,

> > >

> > > __artificial__))

> > >

> > >  _mm_move_sd (__m128d __A, __m128d __B)

> > >  {

> > >

> > > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);

> > > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};

> > >

> > >  }

> >

> > If the goal is to have it represented as a VEC_PERM_EXPR internally, I

> > wonder if we should be explicit and use __builtin_shuffle instead of

> > relying on some forwprop pass to transform it. Maybe not, just asking. And

> > the answer need not even be the same for _mm_move_sd and _mm_move_ss.

>

> I wrote it this way because this pattern could later also be used for the

> other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could not.

> To match the other intrinsics the logic that tries to match vector

> construction just needs to be extended to try merge patterns even if one of

> the subexpressions is not simple.


The question is what users expect and get when they use -O0 with intrinsics?

Richard.

> 'Allan

>

>
Allan Sandfeld Jensen Aug. 2, 2018, 8:39 p.m. | #4
On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:

> >  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,

> > 

> > __artificial__))

> > 

> >  _mm_move_sd (__m128d __A, __m128d __B)

> >  {

> > 

> > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);

> > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};

> > 

> >  }

> 

> If the goal is to have it represented as a VEC_PERM_EXPR internally, I

> wonder if we should be explicit and use __builtin_shuffle instead of

> relying on some forwprop pass to transform it. Maybe not, just asking. And

> the answer need not even be the same for _mm_move_sd and _mm_move_ss.


I forgot. One of the things that makes using __builtin_shuffle ugly is that
__v4si, which is needed as the shuffle argument in _mm_move_ss, is declared
in emmintrin.h, but _mm_move_ss is in xmmintrin.h.

In general the gcc __builtin_shuffle syntax, with the permutation argument being a vector,
is kind of awkward. At least for declaring intrinsics, the clang style,
where the permutation is given as extra arguments, is easier to deal with:
__builtin_shuffle(a, b, (__v4si){4, 0, 1, 2})
 vs
 __builtin_shuffle(a, b, 4, 0, 1, 2)
Allan Sandfeld Jensen Aug. 2, 2018, 8:50 p.m. | #5
On Donnerstag, 2. August 2018 11:18:41 CEST Richard Biener wrote:
> On Thu, Aug 2, 2018 at 11:12 AM Allan Sandfeld Jensen

> 

> <linux@carewolf.com> wrote:

> > On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:

> > > On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:

> > > >  extern __inline __m128d __attribute__((__gnu_inline__,

> > > >  __always_inline__,

> > > > 

> > > > __artificial__))

> > > > 

> > > >  _mm_move_sd (__m128d __A, __m128d __B)

> > > >  {

> > > > 

> > > > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);

> > > > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};

> > > > 

> > > >  }

> > > 

> > > If the goal is to have it represented as a VEC_PERM_EXPR internally, I

> > > wonder if we should be explicit and use __builtin_shuffle instead of

> > > relying on some forwprop pass to transform it. Maybe not, just asking.

> > > And

> > > the answer need not even be the same for _mm_move_sd and _mm_move_ss.

> > 

> > I wrote it this way because this pattern could later also be used for the

> > other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could

> > not. To match the other intrinsics the logic that tries to match vector

> > construction just needs to be extended to try merge patterns even if one

> > of the subexpressions is not simple.

> 

> The question is what users expect and get when they use -O0 with intrinsics?

> 

> Richard.

> 

Here is the version with __builtin_shuffle. It might behave more as expected at -O0,
but it is also uglier.
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..2337ef5ea08 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46143,6 +46143,46 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed. */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i) {
+    {
+      if (d->perm[i] != i + nelt - d->perm[0])
+        return false;
+    }
+  }
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46885,6 +46925,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..45b99ff87d5 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A, (__v4sf)__B,
+                                                  (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */
Marc Glisse Aug. 2, 2018, 9:15 p.m. | #6
On Thu, 2 Aug 2018, Allan Sandfeld Jensen wrote:

> I forgot. One of the things that makes using __builtin_shuffle ugly is that

> __v4si  as the suffle argument needs to be in _mm_move_ss, is declared

> in emmintrin.h, but _mm_move_ss is in xmmintrin.h.


__v4si is some internal detail, I don't see much issue with moving it to 
xmmintrin.h if you want to use it there.

> In general the gcc __builtin_shuffle syntax with the argument being a vector

> is kind of ackward. At least for the declaring intrinsics, the clang still

> where the permutator is extra argument is easier to deal with:

> __builtin_shuffle(a, b, (__v4si){4, 0, 1, 2})

> vs

> __builtin_shuffle(a, b, 4, 0, 1, 2)


__builtin_shufflevector IIRC


>> The question is what users expect and get when they use -O0 with intrinsics?

>>

> Here is the version with __builtin_shuffle. It might be more expectable -O0,

> but it is also uglier.


I am not convinced -O0 is very important.

If you start extending your approach to _mm_add_sd and others, while one 
instruction is easy enough to recognize, if we put several in a row, they 
will be partially simplified and may become harder to recognize.
{ x*(y+v[0]-z), v[1] } requires that you notice that the upper part of 
this vector is v[1], i.e. the upper part of a vector whose lower part 
appears somewhere in the arbitrarily complex expression for the lower 
part of the result. And you then have to propagate the fact that you are 
doing vector operations all the way back to v[0].

I don't have a strong opinion on what the best approach is.

-- 
Marc Glisse
Jakub Jelinek Aug. 2, 2018, 9:46 p.m. | #7
On Thu, Aug 02, 2018 at 10:50:58PM +0200, Allan Sandfeld Jensen wrote:
> Here is the version with __builtin_shuffle. It might be more expectable -O0, 

> but it is also uglier.


I don't find anything ugly on it, except the formatting glitches (missing
space before (, overlong line, and useless __extension__.
Improving code generated for __builtin_shuffle is desirable too.

> --- a/gcc/config/i386/xmmintrin.h

> +++ b/gcc/config/i386/xmmintrin.h

> @@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)

>  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

>  _mm_move_ss (__m128 __A, __m128 __B)

>  {

> -  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);

> +  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A, (__v4sf)__B,

> +                                                  (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});


And obviously use __v4si here instead of __attribute__((__vector_size__ (16))) int.

	Jakub
Allan Sandfeld Jensen Aug. 2, 2018, 9:51 p.m. | #8
On Donnerstag, 2. August 2018 23:46:37 CEST Jakub Jelinek wrote:
> On Thu, Aug 02, 2018 at 10:50:58PM +0200, Allan Sandfeld Jensen wrote:

> > Here is the version with __builtin_shuffle. It might be more expectable

> > -O0, but it is also uglier.

> 

> I don't find anything ugly on it, except the formatting glitches (missing

> space before (, overlong line, and useless __extension__.

> Improving code generated for __builtin_shuffle is desirable too.

> 


__extension__ is needed when using the {...} initialization, otherwise
-std=c89 will produce standards-conformance warnings.  The line is a bit long, but I
thought it looked better like this rather than adding any emergency line
breaks. Is there a hard limit?

> > --- a/gcc/config/i386/xmmintrin.h

> > +++ b/gcc/config/i386/xmmintrin.h

> > @@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)

> > 

> >  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,

> >  __artificial__)) _mm_move_ss (__m128 __A, __m128 __B)

> >  {

> > 

> > -  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);

> > +  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A,

> > (__v4sf)__B, +                                                 

> > (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});

> And obviously use __v4si here instead of __attribute__((__vector_size__

> (16))) int.

> 

__v4si is declared in emmintrin.h, so I couldn't use it here unless I moved 
the definition. I tried changing as little as possible to not trigger bike 
shedding.

'Allan
Allan Sandfeld Jensen Aug. 2, 2018, 9:52 p.m. | #9
On Donnerstag, 2. August 2018 23:15:28 CEST Marc Glisse wrote:
> On Thu, 2 Aug 2018, Allan Sandfeld Jensen wrote:

> > I forgot. One of the things that makes using __builtin_shuffle ugly is

> > that

> > __v4si  as the suffle argument needs to be in _mm_move_ss, is declared

> > in emmintrin.h, but _mm_move_ss is in xmmintrin.h.

> 

> __v4si is some internal detail, I don't see much issue with moving it to

> xmmintrin.h if you want to use it there.

> 

> > In general the gcc __builtin_shuffle syntax with the argument being a

> > vector is kind of ackward. At least for the declaring intrinsics, the

> > clang still where the permutator is extra argument is easier to deal

> > with:

> > __builtin_shuffle(a, b, (__v4si){4, 0, 1, 2})

> > vs

> > __builtin_shuffle(a, b, 4, 0, 1, 2)

> 

> __builtin_shufflevector IIRC

> 

> >> The question is what users expect and get when they use -O0 with

> >> intrinsics?> 

> > Here is the version with __builtin_shuffle. It might be more expectable

> > -O0, but it is also uglier.

> 

> I am not convinced -O0 is very important.

> 

Me neither, and in any case I would argue the logic that recognizes the vector
construction patterns is not an optimization but instruction matching.

> If you start extending your approach to _mm_add_sd and others, while one

> instruction is easy enough to recognize, if we put several in a row, they

> will be partially simplified and may become harder to recognize.

> { x*(y+v[0]-z), v[1] } requires that you notice that the upper part of

> this vector is v[1], i.e. the upper part of a vector whose lower part

> appears somewhere in the arbitrarily complex expression for the lower

> part of the result. And you then have to propagate the fact that you are

> doing vector operations all the way back to v[0].

> 

> I don't have a strong opinion on what the best approach is.


Yes, I am not sure all of those could be done exhaustively with the existing 
logic, and it might also be of dubious value as in almost all cases the ps 
instructions have the same latency and bandwidth as the ss instructions, so 
developers should probably use _ps versions as they are scheduled better by 
the compiler (or at least better by gcc).
It was just an idea, and I haven't tried it at this point.

'Allan
Allan Sandfeld Jensen Aug. 11, 2018, 8:59 a.m. | #10
Updated:

Match movss and movsd "blend" instructions

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-08-11 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7554fd1f659..485850096e9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46145,6 +46145,46 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed. */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i) {
+    {
+      if (d->perm[i] != i + nelt - d->perm[0])
+        return false;
+    }
+  }
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46887,6 +46927,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..f770570295c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,10 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+                                     __extension__
+                                     (__attribute__((__vector_size__ (16))) int)
+                                     {4,1,2,3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */
Jakub Jelinek Aug. 11, 2018, 9:18 a.m. | #11
On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
> +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D

> +   using movss or movsd.  */

> +static bool

> +expand_vec_perm_movs (struct expand_vec_perm_d *d)

> +{

> +  machine_mode vmode = d->vmode;

> +  unsigned i, nelt = d->nelt;

> +  rtx x;

> +

> +  if (d->one_operand_p)

> +    return false;

> +

> +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))

> +    ;

> +  else

> +    return false;

> +

> +  /* Only the first element is changed. */


Two spaces after .

> +  if (d->perm[0] != nelt && d->perm[0] != 0)

> +    return false;

> +  for (i = 1; i < nelt; ++i) {

> +    {

> +      if (d->perm[i] != i + nelt - d->perm[0])

> +        return false;

> +    }

> +  }


Extraneous {}s (both pairs, the outer ones even badly indented).

Otherwise LGTM.

	Jakub
Allan Sandfeld Jensen Aug. 11, 2018, 9:54 a.m. | #12
On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:
> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:

> > +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D

> > +   using movss or movsd.  */

> > +static bool

> > +expand_vec_perm_movs (struct expand_vec_perm_d *d)

> > +{

> > +  machine_mode vmode = d->vmode;

> > +  unsigned i, nelt = d->nelt;

> > +  rtx x;

> > +

> > +  if (d->one_operand_p)

> > +    return false;

> > +

> > +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))

> > +    ;

> > +  else

> > +    return false;

> > +

> > +  /* Only the first element is changed. */

> 

> Two spaces after .

> 

> > +  if (d->perm[0] != nelt && d->perm[0] != 0)

> > +    return false;

> > +  for (i = 1; i < nelt; ++i) {

> > +    {

> > +      if (d->perm[i] != i + nelt - d->perm[0])

> > +        return false;

> > +    }

> > +  }

> 

> Extraneous {}s (both pairs, the outer ones even badly indented).

> 

> Otherwise LGTM.

> 

Updated:

Note: as an infrequent contributor I don't have commit access, so I need someone
reviewing to also commit.

'Allan
From e33241e5ddc7fa57c4ba7893669af7f7e636125e Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@qt.io>

Date: Sat, 11 Aug 2018 11:52:21 +0200
Subject: [PATCH] Match movss and movsd "blend" instructions

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-08-11 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
---
 gcc/config/i386/emmintrin.h |  2 +-
 gcc/config/i386/i386.c      | 41 +++++++++++++++++++++++++++++++++++++
 gcc/config/i386/xmmintrin.h |  5 ++++-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7554fd1f659..15a3caa94c3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46145,6 +46145,43 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed.  */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != i + nelt - d->perm[0])
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46887,6 +46924,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..f770570295c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,10 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+                                     __extension__
+                                     (__attribute__((__vector_size__ (16))) int)
+                                     {4,1,2,3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */
-- 
2.17.1
Uros Bizjak Aug. 12, 2018, 2:39 p.m. | #13
On Sat, Aug 11, 2018 at 11:54 AM, Allan Sandfeld Jensen
<linux@carewolf.com> wrote:
> On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:

>> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:

>> > +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D

>> > +   using movss or movsd.  */

>> > +static bool

>> > +expand_vec_perm_movs (struct expand_vec_perm_d *d)

>> > +{

>> > +  machine_mode vmode = d->vmode;

>> > +  unsigned i, nelt = d->nelt;

>> > +  rtx x;

>> > +

>> > +  if (d->one_operand_p)

>> > +    return false;

>> > +

>> > +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))

>> > +    ;

>> > +  else

>> > +    return false;

>> > +

>> > +  /* Only the first element is changed. */

>>

>> Two spaces after .

>>

>> > +  if (d->perm[0] != nelt && d->perm[0] != 0)

>> > +    return false;

>> > +  for (i = 1; i < nelt; ++i) {

>> > +    {

>> > +      if (d->perm[i] != i + nelt - d->perm[0])

>> > +        return false;

>> > +    }

>> > +  }

>>

>> Extraneous {}s (both pairs, the outer ones even badly indented).

>>

>> Otherwise LGTM.

>>

> Updated:

>

> Note as an infrequent contributor don't have commit access, so I need someone

> reviewing to also commit.


+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;

V4SFmode can be used with TARGET_SSE only.

Uros.
Jeff Law Aug. 15, 2018, 4:33 a.m. | #14
On 08/11/2018 03:54 AM, Allan Sandfeld Jensen wrote:
> On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:

>> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:

>>> +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D

>>> +   using movss or movsd.  */

>>> +static bool

>>> +expand_vec_perm_movs (struct expand_vec_perm_d *d)

>>> +{

>>> +  machine_mode vmode = d->vmode;

>>> +  unsigned i, nelt = d->nelt;

>>> +  rtx x;

>>> +

>>> +  if (d->one_operand_p)

>>> +    return false;

>>> +

>>> +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))

>>> +    ;

>>> +  else

>>> +    return false;

>>> +

>>> +  /* Only the first element is changed. */

>>

>> Two spaces after .

>>

>>> +  if (d->perm[0] != nelt && d->perm[0] != 0)

>>> +    return false;

>>> +  for (i = 1; i < nelt; ++i) {

>>> +    {

>>> +      if (d->perm[i] != i + nelt - d->perm[0])

>>> +        return false;

>>> +    }

>>> +  }

>>

>> Extraneous {}s (both pairs, the outer ones even badly indented).

>>

>> Otherwise LGTM.

>>

> Updated:

> 

> Note as an infrequent contributor don't have commit access, so I need someone 

> reviewing to also commit.

I fixed up the ChangeLog, extracted the test from the original patch and
committed all the bits to the trunk.

Thanks,
jeff
Uros Bizjak Aug. 15, 2018, 7:36 p.m. | #15
On Wed, Aug 15, 2018 at 6:33 AM, Jeff Law <law@redhat.com> wrote:
> On 08/11/2018 03:54 AM, Allan Sandfeld Jensen wrote:

>> On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:

>>> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:

>>>> +/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D

>>>> +   using movss or movsd.  */

>>>> +static bool

>>>> +expand_vec_perm_movs (struct expand_vec_perm_d *d)

>>>> +{

>>>> +  machine_mode vmode = d->vmode;

>>>> +  unsigned i, nelt = d->nelt;

>>>> +  rtx x;

>>>> +

>>>> +  if (d->one_operand_p)

>>>> +    return false;

>>>> +

>>>> +  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))

>>>> +    ;

>>>> +  else

>>>> +    return false;

>>>> +

>>>> +  /* Only the first element is changed. */

>>>

>>> Two spaces after .

>>>

>>>> +  if (d->perm[0] != nelt && d->perm[0] != 0)

>>>> +    return false;

>>>> +  for (i = 1; i < nelt; ++i) {

>>>> +    {

>>>> +      if (d->perm[i] != i + nelt - d->perm[0])

>>>> +        return false;

>>>> +    }

>>>> +  }

>>>

>>> Extraneous {}s (both pairs, the outer ones even badly indented).

>>>

>>> Otherwise LGTM.

>>>

>> Updated:

>>

>> Note as an infrequent contributor don't have commit access, so I need someone

>> reviewing to also commit.

> I fixed up the ChangeLog, extracted the test from the original patch and

> committed all the bits to the trunk.


I have amended the committed code with attached fixup patch.

2018-08-15  Uros Bizjak  <ubizjak@gmail.com>

    * config/i386/i386.c (expand_vec_perm_movs): Enable V4SFmode
    for TARGET_SSE.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 263563)
+++ config/i386/i386.c	(working copy)
@@ -46157,9 +46157,8 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
   if (d->one_operand_p)
     return false;
 
-  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
-    ;
-  else
+  if (!(TARGET_SSE && vmode == V4SFmode)
+      && !(TARGET_SSE2 && vmode == V2DFmode))
     return false;
 
   /* Only the first element is changed.  */

Patch

From e96b3aa9017ad0d19238c923146196405cc4e5af Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@qt.io>
Date: Wed, 9 May 2018 12:35:14 +0200
Subject: [PATCH] Match movss and movsd blends

Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.

2018-07-29 Allan Sandfeld Jensen <allan.jensen@qt.io>

gcc/config/i386

    * i386.cc (expand_vec_perm_movs): New method matching movs
    patterns.
    * i386.cc (expand_vec_perm_1): Try the new method.

gcc/testsuite

    * gcc.target/i386/sse2-movs.c: New test.
---
 gcc/config/i386/emmintrin.h               |  2 +-
 gcc/config/i386/i386.c                    | 44 +++++++++++++++++++++++
 gcc/config/i386/xmmintrin.h               |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-movs.c | 21 +++++++++++
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-movs.c

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..1efd943bac4 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@  _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..2337ef5ea08 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46143,6 +46143,46 @@  expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+    ;
+  else
+    return false;
+
+  /* Only the first element is changed. */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i) {
+    {
+      if (d->perm[i] != i + nelt - d->perm[0])
+        return false;
+    }
+  }
+
+  if (d->testing_p)
+    return true;
+
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46885,6 +46925,10 @@  expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..699f681e054 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,7 @@  _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return __extension__ (__m128)(__v4sf){__B[0],__A[1],__A[2],__A[3]};
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-movs.c b/gcc/testsuite/gcc.target/i386/sse2-movs.c
new file mode 100644
index 00000000000..79f486cfa82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-movs.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler "movsd" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "shufpd" } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+v4sf movss(v4sf a, v4sf b)
+{
+     return (v4sf){b[0],a[1],a[2],a[3]};
+}
+
+v2df movsd(v2df a, v2df b)
+{
+     return (v2df){b[0],a[1]};
+}
-- 
2.17.0