V2 [PATCH] Optimize vector constructor

Message ID 20190304174625.GA25613@gmail.com
State New
Headers show
Series
  • V2 [PATCH] Optimize vector constructor
Related show

Commit Message

H.J. Lu March 4, 2019, 5:46 p.m.
On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> >

> > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:

> > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > >

> > > > For vector init constructor:

> > > >

> > > > ---

> > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));

> > > >

> > > > __v4sf

> > > > foo (__v4sf x, float f)

> > > > {

> > > >   __v4sf y = { f, x[1], x[2], x[3] };

> > > >   return y;

> > > > }

> > > > ---

> > > >

> > > > we can optimize vector init constructor with vector copy or permute

> > > > followed by a single scalar insert:


> and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way

> is to emit a new stmt for _2 = copy ...; and do the set_rhs with the

> BIT_INSERT_EXPR.


Thanks for the BIT_INSERT_EXPR suggestion.  I am testing this patch.


H.J.
---
We can optimize a vector constructor into a vector copy or permute followed
by a single scalar insert, replacing:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1a.c: New test.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
	* gcc.target/i386/pr88828-7.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 22 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 22 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3d.c | 24 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-7.c  | 22 ++++++
 gcc/tree-ssa-forwprop.c                    | 84 +++++++++++++++++++---
 15 files changed, 338 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c

-- 
2.20.1

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..4ef1feab389
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..2cddf4263f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..6dc482b6f4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..97eb8e7162a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..ab2ba730716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..0db7f9e145b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[1], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3d.c b/gcc/testsuite/gcc.target/i386/pr88828-3d.c
new file mode 100644
index 00000000000..33e2b6e5881
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3d.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[1], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..a54689be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..0c3a1024d93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..534808d3cd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..aebea790979
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..d43a36d9137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..6856fe6500e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c
new file mode 100644
index 00000000000..2cddf4263f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..ce00c43d7e7 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,41 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
 	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
       if (!def_stmt)
-	return false;
+	{
+	  if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
+	    {
+	      /* Only allow one scalar insert.  */
+	      if (nscalars != 0)
+		return false;
+
+	      nscalars = 1;
+	      insert = true;
+	      scalar_idx = i;
+	      scalar_element = ce->value;
+	      continue;
+	    }
+	  else
+	    return false;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2069,7 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2095,6 +2118,18 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
@@ -2113,6 +2148,12 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  || conv_code == CALL_EXPR))
     return false;
 
+  /* Replace the scalar element with the vector element.  */
+  if (insert
+      && (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	  == (nscalars + nvectors)))
+    sel.quick_push (scalar_idx);
+
   if (maybe_ident)
     {
       if (conv_code == ERROR_MARK)
@@ -2127,18 +2168,26 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2197,25 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      tree val = gimple_assign_rhs1 (stmt);
+      gimple *copy = gimple_build_assign (var, val);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }