Add vect_recog_popcount_pattern to handle mismatch between the vectorized popcount IFN and scalar popcount builtin.

Message ID 20210617062912.89506-1-hongtao.liu@intel.com
State New
Headers show
Series
  • Add vect_recog_popcount_pattern to handle mismatch between the vectorized popcount IFN and scalar popcount builtin.
Related show

Commit Message

H.J. Lu via Gcc-patches June 17, 2021, 6:29 a.m.
The patch remove those pro- and demotions when backend support direct
optab.

For i386: it enables vectorization for vpopcntb/vpopcntw and optimized
for vpopcntq.

gcc/ChangeLog:

	PR tree-optimization/97770
	* tree-vect-patterns.c (vect_recog_popcount_pattern):
	New.
	(vect_recog_func vect_vect_recog_func_ptrs): Add new pattern.

gcc/testsuite/ChangeLog:

	PR tree-optimization/97770
	* gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail.
	* gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.
---
 .../gcc.target/i386/avx512bitalg-pr97770-1.c  |  27 +++--
 .../i386/avx512vpopcntdq-pr97770-1.c          |   9 +-
 gcc/tree-vect-patterns.c                      | 110 ++++++++++++++++++
 3 files changed, 127 insertions(+), 19 deletions(-)

-- 
2.18.1

Comments

H.J. Lu via Gcc-patches June 21, 2021, 10:05 a.m. | #1
On Thu, Jun 17, 2021 at 8:29 AM liuhongt <hongtao.liu@intel.com> wrote:
>

> The patch remove those pro- and demotions when backend support direct

> optab.

>

> For i386: it enables vectorization for vpopcntb/vpopcntw and optimized

> for vpopcntq.

>

> gcc/ChangeLog:

>

>         PR tree-optimization/97770

>         * tree-vect-patterns.c (vect_recog_popcount_pattern):

>         New.

>         (vect_recog_func vect_vect_recog_func_ptrs): Add new pattern.

>

> gcc/testsuite/ChangeLog:

>

>         PR tree-optimization/97770

>         * gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail.

>         * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.

> ---

>  .../gcc.target/i386/avx512bitalg-pr97770-1.c  |  27 +++--

>  .../i386/avx512vpopcntdq-pr97770-1.c          |   9 +-

>  gcc/tree-vect-patterns.c                      | 110 ++++++++++++++++++

>  3 files changed, 127 insertions(+), 19 deletions(-)

>

> diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> index c83a477045c..d1beec4cdb4 100644

> --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> @@ -1,19 +1,18 @@

>  /* PR target/97770 */

>  /* { dg-do compile } */

> -/* { dg-options "-O2 -mavx512bitalg -mavx512vl -mprefer-vector-width=512" } */

> -/* Add xfail since no IFN for QI/HImode popcount */

> -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */

> -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */

> -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */

> -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */

> -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */

> -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */

> +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */

> +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

>

>  #include <immintrin.h>

>

>  void

>  __attribute__ ((noipa, optimize("-O3")))

> -popcountb_128 (char * __restrict dest, char* src)

> +popcountb_128 (unsigned char * __restrict dest, unsigned char* src)

>  {

>    for (int i = 0; i != 16; i++)

>      dest[i] = __builtin_popcount (src[i]);

> @@ -21,7 +20,7 @@ popcountb_128 (char * __restrict dest, char* src)

>

>  void

>  __attribute__ ((noipa, optimize("-O3")))

> -popcountw_128 (short* __restrict dest, short* src)

> +popcountw_128 (unsigned short* __restrict dest, unsigned short* src)

>  {

>    for (int i = 0; i != 8; i++)

>      dest[i] = __builtin_popcount (src[i]);

> @@ -29,7 +28,7 @@ popcountw_128 (short* __restrict dest, short* src)

>

>  void

>  __attribute__ ((noipa, optimize("-O3")))

> -popcountb_256 (char * __restrict dest, char* src)

> +popcountb_256 (unsigned char * __restrict dest, unsigned char* src)

>  {

>    for (int i = 0; i != 32; i++)

>      dest[i] = __builtin_popcount (src[i]);

> @@ -37,7 +36,7 @@ popcountb_256 (char * __restrict dest, char* src)

>

>  void

>  __attribute__ ((noipa, optimize("-O3")))

> -popcountw_256 (short* __restrict dest, short* src)

> +popcountw_256 (unsigned short* __restrict dest, unsigned short* src)

>  {

>    for (int i = 0; i != 16; i++)

>      dest[i] = __builtin_popcount (src[i]);

> @@ -45,7 +44,7 @@ popcountw_256 (short* __restrict dest, short* src)

>

>  void

>  __attribute__ ((noipa, optimize("-O3")))

> -popcountb_512 (char * __restrict dest, char* src)

> +popcountb_512 (unsigned char * __restrict dest, unsigned char* src)

>  {

>    for (int i = 0; i != 64; i++)

>      dest[i] = __builtin_popcount (src[i]);

> @@ -53,7 +52,7 @@ popcountb_512 (char * __restrict dest, char* src)

>

>  void

>  __attribute__ ((noipa, optimize("-O3")))

> -popcountw_512 (short* __restrict dest, short* src)

> +popcountw_512 (unsigned short* __restrict dest, unsigned short* src)

>  {

>    for (int i = 0; i != 32; i++)

>      dest[i] = __builtin_popcount (src[i]);

> diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> index 63bb00d9b4a..dedd2e4c3d6 100644

> --- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> +++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> @@ -1,13 +1,12 @@

>  /* PR target/97770 */

>  /* { dg-do compile } */

> -/* { dg-options "-O2 -mavx512vpopcntdq -mavx512vl -mprefer-vector-width=512" } */

> +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */

>  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*xmm" 1 } } */

>  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*ymm" 1 } } */

>  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*zmm" 1 } } */

> -/* Add xfail since current vectorizor cannot generate expected code for DImode popcount */

> -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1 { xfail *-*-* } } } */

> -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1 { xfail *-*-* } } } */

> -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1 { xfail *-*-* } } } */

> +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

>  #ifndef AVX512VPOPCNTQ_H_INCLUDED

>  #define AVX512VPOPCNTQ_H_INCLUDED

>

> diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c

> index 177d44ebb5e..5c80800efbb 100644

> --- a/gcc/tree-vect-patterns.c

> +++ b/gcc/tree-vect-patterns.c

> @@ -1292,6 +1292,115 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,

>                                       "vect_recog_widen_minus_pattern");

>  }

>

> +/* Function vect_recog_popcount_pattern

> +

> +   Try to find the following pattern:

> +

> +   UTYPE1 A;

> +   TYPE1 B;

> +   UTYPE2 temp_in;

> +   TYPE3 temp_out;

> +   temp_in = (TYPE2)A;

> +

> +   temp_out = __builtin_popcount{,l,ll} (temp_in);

> +   B = (TYPE1) temp_out;

> +

> +   TYPE2 may or may not be equal to TYPE3.

> +   i.e. TYPE2 is equal to TYPE3 for __builtin_popcount

> +   i.e. TYPE2 is not equal to TYPE3 for __builtin_popcountll

> +

> +   Input:

> +

> +   * STMT_VINFO: The stmt from which the pattern search begins.

> +   here it starts with B = (TYPE1) temp_out;

> +

> +   Output:

> +

> +   * TYPE_OUT: The vector type of the output of this pattern.

> +

> +   * Return value: A new stmt that will be used to replace the sequence of

> +   stmts that constitute the pattern. In this case it will be:

> +   B = .POPCOUNT (A);

> +*/

> +

> +static gimple *

> +vect_recog_popcount_pattern (vec_info *vinfo,

> +                            stmt_vec_info stmt_vinfo, tree *type_out)

> +{

> +  gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);

> +  gimple *popcount_stmt, *pattern_stmt;

> +  tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;

> +  auto_vec<tree> vargs;

> +

> +  /* Find B = (TYPE1) temp_out. */

> +  if (!last_stmt)

> +    return NULL;

> +  tree_code code = gimple_assign_rhs_code (last_stmt);

> +  if (!CONVERT_EXPR_CODE_P (code))

> +    return NULL;

> +

> +  lhs_oprnd = gimple_assign_lhs (last_stmt);

> +  lhs_type = TREE_TYPE (lhs_oprnd);

> +  if (TREE_CODE (lhs_type) != INTEGER_TYPE)

> +    return NULL;


INTEGRAL_TYPE_P

> +  rhs_oprnd = gimple_assign_rhs1 (last_stmt);

> +  if (TREE_CODE (rhs_oprnd) != SSA_NAME

> +      || !has_single_use (rhs_oprnd))

> +    return NULL;

> +  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);

> +

> +  /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */

> +  if (!is_gimple_call (popcount_stmt)

> +      || !gimple_call_lhs (popcount_stmt))


Since you're arriving here via use-def chain the LHS will
never be NULL.

> +    return NULL;

> +  switch (gimple_call_combined_fn (popcount_stmt))

> +    {

> +    CASE_CFN_POPCOUNT:

> +      break;

> +    default:

> +      return NULL;

> +    }

> +


for safety:

    if (gimple_call_num_args (popcount_stmt) != 1)
      return NULL;

> +  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);

> +  vect_unpromoted_value unprom_diff;

> +  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,

> +                                                   &unprom_diff);

> +

> +  if (!rhs_origin)

> +    return NULL;

> +

> +  /* Input and outout of .POPCOUNT should be same-precision integer.

> +     Also A should be unsigned or same presion as temp_in,

> +     otherwise there would be sign_extend from A to temp_in.  */

> +  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)

> +      || !(TYPE_UNSIGNED (unprom_diff.type)

> +          || (TYPE_PRECISION (unprom_diff.type)

> +              == TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))


Note I find a if (A || !(B || C)) hard to read, please write if (A ||
(!B && !C)) instead.

OK otherwise.

Thanks,
Richard.

> +    return NULL;

> +  vargs.safe_push (unprom_diff.op);

> +

> +  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);

> +  vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);

> +  /* Do it only the backend existed popcount<vector_mode>2.  */

> +  if (!direct_internal_fn_supported_p (IFN_POPCOUNT,

> +                                      vec_type,

> +                                      OPTIMIZE_FOR_SPEED))

> +    return NULL;

> +

> +  /* Create B = .POPCOUNT (A).  */

> +  new_var = vect_recog_temp_ssa_var (lhs_type, NULL);

> +  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);

> +  gimple_call_set_lhs (pattern_stmt, new_var);

> +  gimple_set_location (pattern_stmt, gimple_location (last_stmt));

> +  *type_out = vec_type;

> +

> +  if (dump_enabled_p ())

> +    dump_printf_loc (MSG_NOTE, vect_location,

> +                    "created pattern stmt: %G", pattern_stmt);

> +  return pattern_stmt;

> +}

> +

>  /* Function vect_recog_pow_pattern

>

>     Try to find the following pattern:

> @@ -5283,6 +5392,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {

>    { vect_recog_sad_pattern, "sad" },

>    { vect_recog_widen_sum_pattern, "widen_sum" },

>    { vect_recog_pow_pattern, "pow" },

> +  { vect_recog_popcount_pattern, "popcount" },

>    { vect_recog_widen_shift_pattern, "widen_shift" },

>    { vect_recog_rotate_pattern, "rotate" },

>    { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },

> --

> 2.18.1

>
H.J. Lu via Gcc-patches June 22, 2021, 2:43 a.m. | #2
On Mon, Jun 21, 2021 at 6:05 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>

> On Thu, Jun 17, 2021 at 8:29 AM liuhongt <hongtao.liu@intel.com> wrote:

> >

> > The patch remove those pro- and demotions when backend support direct

> > optab.

> >

> > For i386: it enables vectorization for vpopcntb/vpopcntw and optimized

> > for vpopcntq.

> >

> > gcc/ChangeLog:

> >

> >         PR tree-optimization/97770

> >         * tree-vect-patterns.c (vect_recog_popcount_pattern):

> >         New.

> >         (vect_recog_func vect_vect_recog_func_ptrs): Add new pattern.

> >

> > gcc/testsuite/ChangeLog:

> >

> >         PR tree-optimization/97770

> >         * gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail.

> >         * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.

> > ---

> >  .../gcc.target/i386/avx512bitalg-pr97770-1.c  |  27 +++--

> >  .../i386/avx512vpopcntdq-pr97770-1.c          |   9 +-

> >  gcc/tree-vect-patterns.c                      | 110 ++++++++++++++++++

> >  3 files changed, 127 insertions(+), 19 deletions(-)

> >

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> > index c83a477045c..d1beec4cdb4 100644

> > --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> > +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> > @@ -1,19 +1,18 @@

> >  /* PR target/97770 */

> >  /* { dg-do compile } */

> > -/* { dg-options "-O2 -mavx512bitalg -mavx512vl -mprefer-vector-width=512" } */

> > -/* Add xfail since no IFN for QI/HImode popcount */

> > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */

> > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */

> > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */

> > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */

> > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */

> > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */

> > +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */

> > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> >

> >  #include <immintrin.h>

> >

> >  void

> >  __attribute__ ((noipa, optimize("-O3")))

> > -popcountb_128 (char * __restrict dest, char* src)

> > +popcountb_128 (unsigned char * __restrict dest, unsigned char* src)

> >  {

> >    for (int i = 0; i != 16; i++)

> >      dest[i] = __builtin_popcount (src[i]);

> > @@ -21,7 +20,7 @@ popcountb_128 (char * __restrict dest, char* src)

> >

> >  void

> >  __attribute__ ((noipa, optimize("-O3")))

> > -popcountw_128 (short* __restrict dest, short* src)

> > +popcountw_128 (unsigned short* __restrict dest, unsigned short* src)

> >  {

> >    for (int i = 0; i != 8; i++)

> >      dest[i] = __builtin_popcount (src[i]);

> > @@ -29,7 +28,7 @@ popcountw_128 (short* __restrict dest, short* src)

> >

> >  void

> >  __attribute__ ((noipa, optimize("-O3")))

> > -popcountb_256 (char * __restrict dest, char* src)

> > +popcountb_256 (unsigned char * __restrict dest, unsigned char* src)

> >  {

> >    for (int i = 0; i != 32; i++)

> >      dest[i] = __builtin_popcount (src[i]);

> > @@ -37,7 +36,7 @@ popcountb_256 (char * __restrict dest, char* src)

> >

> >  void

> >  __attribute__ ((noipa, optimize("-O3")))

> > -popcountw_256 (short* __restrict dest, short* src)

> > +popcountw_256 (unsigned short* __restrict dest, unsigned short* src)

> >  {

> >    for (int i = 0; i != 16; i++)

> >      dest[i] = __builtin_popcount (src[i]);

> > @@ -45,7 +44,7 @@ popcountw_256 (short* __restrict dest, short* src)

> >

> >  void

> >  __attribute__ ((noipa, optimize("-O3")))

> > -popcountb_512 (char * __restrict dest, char* src)

> > +popcountb_512 (unsigned char * __restrict dest, unsigned char* src)

> >  {

> >    for (int i = 0; i != 64; i++)

> >      dest[i] = __builtin_popcount (src[i]);

> > @@ -53,7 +52,7 @@ popcountb_512 (char * __restrict dest, char* src)

> >

> >  void

> >  __attribute__ ((noipa, optimize("-O3")))

> > -popcountw_512 (short* __restrict dest, short* src)

> > +popcountw_512 (unsigned short* __restrict dest, unsigned short* src)

> >  {

> >    for (int i = 0; i != 32; i++)

> >      dest[i] = __builtin_popcount (src[i]);

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> > index 63bb00d9b4a..dedd2e4c3d6 100644

> > --- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> > +++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> > @@ -1,13 +1,12 @@

> >  /* PR target/97770 */

> >  /* { dg-do compile } */

> > -/* { dg-options "-O2 -mavx512vpopcntdq -mavx512vl -mprefer-vector-width=512" } */

> > +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */

> >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*xmm" 1 } } */

> >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*ymm" 1 } } */

> >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*zmm" 1 } } */

> > -/* Add xfail since current vectorizor cannot generate expected code for DImode popcount */

> > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1 { xfail *-*-* } } } */

> > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1 { xfail *-*-* } } } */

> > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1 { xfail *-*-* } } } */

> > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> >  #ifndef AVX512VPOPCNTQ_H_INCLUDED

> >  #define AVX512VPOPCNTQ_H_INCLUDED

> >

> > diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c

> > index 177d44ebb5e..5c80800efbb 100644

> > --- a/gcc/tree-vect-patterns.c

> > +++ b/gcc/tree-vect-patterns.c

> > @@ -1292,6 +1292,115 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,

> >                                       "vect_recog_widen_minus_pattern");

> >  }

> >

> > +/* Function vect_recog_popcount_pattern

> > +

> > +   Try to find the following pattern:

> > +

> > +   UTYPE1 A;

> > +   TYPE1 B;

> > +   UTYPE2 temp_in;

> > +   TYPE3 temp_out;

> > +   temp_in = (TYPE2)A;

> > +

> > +   temp_out = __builtin_popcount{,l,ll} (temp_in);

> > +   B = (TYPE1) temp_out;

> > +

> > +   TYPE2 may or may not be equal to TYPE3.

> > +   i.e. TYPE2 is equal to TYPE3 for __builtin_popcount

> > +   i.e. TYPE2 is not equal to TYPE3 for __builtin_popcountll

> > +

> > +   Input:

> > +

> > +   * STMT_VINFO: The stmt from which the pattern search begins.

> > +   here it starts with B = (TYPE1) temp_out;

> > +

> > +   Output:

> > +

> > +   * TYPE_OUT: The vector type of the output of this pattern.

> > +

> > +   * Return value: A new stmt that will be used to replace the sequence of

> > +   stmts that constitute the pattern. In this case it will be:

> > +   B = .POPCOUNT (A);

> > +*/

> > +

> > +static gimple *

> > +vect_recog_popcount_pattern (vec_info *vinfo,

> > +                            stmt_vec_info stmt_vinfo, tree *type_out)

> > +{

> > +  gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);

> > +  gimple *popcount_stmt, *pattern_stmt;

> > +  tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;

> > +  auto_vec<tree> vargs;

> > +

> > +  /* Find B = (TYPE1) temp_out. */

> > +  if (!last_stmt)

> > +    return NULL;

> > +  tree_code code = gimple_assign_rhs_code (last_stmt);

> > +  if (!CONVERT_EXPR_CODE_P (code))

> > +    return NULL;

> > +

> > +  lhs_oprnd = gimple_assign_lhs (last_stmt);

> > +  lhs_type = TREE_TYPE (lhs_oprnd);

> > +  if (TREE_CODE (lhs_type) != INTEGER_TYPE)

> > +    return NULL;

>

> INTEGRAL_TYPE_P

>

Changed.
> > +  rhs_oprnd = gimple_assign_rhs1 (last_stmt);

> > +  if (TREE_CODE (rhs_oprnd) != SSA_NAME

> > +      || !has_single_use (rhs_oprnd))

> > +    return NULL;

> > +  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);

> > +

> > +  /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */

> > +  if (!is_gimple_call (popcount_stmt)

> > +      || !gimple_call_lhs (popcount_stmt))

>

> Since you're arriving here via use-def chain the LHS will

> never be NULL.

>

> > +    return NULL;

> > +  switch (gimple_call_combined_fn (popcount_stmt))

> > +    {

> > +    CASE_CFN_POPCOUNT:

> > +      break;

> > +    default:

> > +      return NULL;

> > +    }

> > +

>

> for safety:

>

>     if (gimple_call_num_args (popcount_stmt) != 1)

>       return NULL;

>

Changed.
> > +  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);

> > +  vect_unpromoted_value unprom_diff;

> > +  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,

> > +                                                   &unprom_diff);

> > +

> > +  if (!rhs_origin)

> > +    return NULL;

> > +

> > +  /* Input and outout of .POPCOUNT should be same-precision integer.

> > +     Also A should be unsigned or same presion as temp_in,

> > +     otherwise there would be sign_extend from A to temp_in.  */

> > +  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)

> > +      || !(TYPE_UNSIGNED (unprom_diff.type)

> > +          || (TYPE_PRECISION (unprom_diff.type)

> > +              == TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))

>

> Note I find a if (A || !(B || C)) hard to read, please write if (A ||

> (!B && !C)) instead.

>

Changed.
> OK otherwise.

>

> Thanks,

> Richard.

>

> > +    return NULL;

> > +  vargs.safe_push (unprom_diff.op);

> > +

> > +  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);

> > +  vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);

> > +  /* Do it only the backend existed popcount<vector_mode>2.  */

> > +  if (!direct_internal_fn_supported_p (IFN_POPCOUNT,

> > +                                      vec_type,

> > +                                      OPTIMIZE_FOR_SPEED))

> > +    return NULL;

> > +

> > +  /* Create B = .POPCOUNT (A).  */

> > +  new_var = vect_recog_temp_ssa_var (lhs_type, NULL);

> > +  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);

> > +  gimple_call_set_lhs (pattern_stmt, new_var);

> > +  gimple_set_location (pattern_stmt, gimple_location (last_stmt));

> > +  *type_out = vec_type;

> > +

> > +  if (dump_enabled_p ())

> > +    dump_printf_loc (MSG_NOTE, vect_location,

> > +                    "created pattern stmt: %G", pattern_stmt);

> > +  return pattern_stmt;

> > +}

> > +

> >  /* Function vect_recog_pow_pattern

> >

> >     Try to find the following pattern:

> > @@ -5283,6 +5392,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {

> >    { vect_recog_sad_pattern, "sad" },

> >    { vect_recog_widen_sum_pattern, "widen_sum" },

> >    { vect_recog_pow_pattern, "pow" },

> > +  { vect_recog_popcount_pattern, "popcount" },

> >    { vect_recog_widen_shift_pattern, "widen_shift" },

> >    { vect_recog_rotate_pattern, "rotate" },

> >    { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },

> > --

> > 2.18.1

> >


Thanks for the review, here is the patch I'm checking in.

-- 
BR,
Hongtao
From dc8c51031bb38d04ffcb52a36aaeab471ee2ad0c Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 16 Jun 2021 17:34:43 +0800
Subject: [PATCH] Add vect_recog_popcount_pattern to handle mismatch between
 the vectorized popcount IFN and scalar popcount builtin.

The patch remove those pro- and demotions when backend support direct
optab.

For i386: it enables vectorization for vpopcntb/vpopcntw and optimized
for vpopcntq.

gcc/ChangeLog:

	PR tree-optimization/97770
	* tree-vect-patterns.c (vect_recog_popcount_pattern):
	New.
	(vect_recog_func vect_vect_recog_func_ptrs): Add new pattern.

gcc/testsuite/ChangeLog:

	PR tree-optimization/97770
	* gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail.
	* gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.
---
 .../gcc.target/i386/avx512bitalg-pr97770-1.c  |  27 ++---
 .../i386/avx512vpopcntdq-pr97770-1.c          |   9 +-
 gcc/tree-vect-patterns.c                      | 112 ++++++++++++++++++
 3 files changed, 129 insertions(+), 19 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
index c83a477045c..d1beec4cdb4 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
@@ -1,19 +1,18 @@
 /* PR target/97770 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512bitalg -mavx512vl -mprefer-vector-width=512" } */
-/* Add xfail since no IFN for QI/HImode popcount */
-/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */
+/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */
 
 #include <immintrin.h>
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountb_128 (char * __restrict dest, char* src)
+popcountb_128 (unsigned char * __restrict dest, unsigned char* src)
 {
   for (int i = 0; i != 16; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -21,7 +20,7 @@ popcountb_128 (char * __restrict dest, char* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountw_128 (short* __restrict dest, short* src)
+popcountw_128 (unsigned short* __restrict dest, unsigned short* src)
 {
   for (int i = 0; i != 8; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -29,7 +28,7 @@ popcountw_128 (short* __restrict dest, short* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountb_256 (char * __restrict dest, char* src)
+popcountb_256 (unsigned char * __restrict dest, unsigned char* src)
 {
   for (int i = 0; i != 32; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -37,7 +36,7 @@ popcountb_256 (char * __restrict dest, char* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountw_256 (short* __restrict dest, short* src)
+popcountw_256 (unsigned short* __restrict dest, unsigned short* src)
 {
   for (int i = 0; i != 16; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -45,7 +44,7 @@ popcountw_256 (short* __restrict dest, short* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountb_512 (char * __restrict dest, char* src)
+popcountb_512 (unsigned char * __restrict dest, unsigned char* src)
 {
   for (int i = 0; i != 64; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -53,7 +52,7 @@ popcountb_512 (char * __restrict dest, char* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountw_512 (short* __restrict dest, short* src)
+popcountw_512 (unsigned short* __restrict dest, unsigned short* src)
 {
   for (int i = 0; i != 32; i++)
     dest[i] = __builtin_popcount (src[i]);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
index 63bb00d9b4a..dedd2e4c3d6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
@@ -1,13 +1,12 @@
 /* PR target/97770 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512vpopcntdq -mavx512vl -mprefer-vector-width=512" } */
+/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */
 /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*xmm" 1 } } */
 /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*ymm" 1 } } */
 /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*zmm" 1 } } */
-/* Add xfail since current vectorizor cannot generate expected code for DImode popcount */
-/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */
 #ifndef AVX512VPOPCNTQ_H_INCLUDED
 #define AVX512VPOPCNTQ_H_INCLUDED
 
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 177d44ebb5e..59727056dc7 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -1292,6 +1292,117 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
 				      "vect_recog_widen_minus_pattern");
 }
 
+/* Function vect_recog_popcount_pattern
+
+   Try to find the following pattern:
+
+   UTYPE1 A;
+   TYPE1 B;
+   UTYPE2 temp_in;
+   TYPE3 temp_out;
+   temp_in = (TYPE2)A;
+
+   temp_out = __builtin_popcount{,l,ll} (temp_in);
+   B = (TYPE1) temp_out;
+
+   TYPE2 may or may not be equal to TYPE3.
+   i.e. TYPE2 is equal to TYPE3 for __builtin_popcount
+   i.e. TYPE2 is not equal to TYPE3 for __builtin_popcountll
+
+   Input:
+
+   * STMT_VINFO: The stmt from which the pattern search begins.
+   here it starts with B = (TYPE1) temp_out;
+
+   Output:
+
+   * TYPE_OUT: The vector type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the sequence of
+   stmts that constitute the pattern. In this case it will be:
+   B = .POPCOUNT (A);
+*/
+
+static gimple *
+vect_recog_popcount_pattern (vec_info *vinfo,
+			     stmt_vec_info stmt_vinfo, tree *type_out)
+{
+  gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
+  gimple *popcount_stmt, *pattern_stmt;
+  tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
+  auto_vec<tree> vargs;
+
+  /* Find B = (TYPE1) temp_out. */
+  if (!last_stmt)
+    return NULL;
+  tree_code code = gimple_assign_rhs_code (last_stmt);
+  if (!CONVERT_EXPR_CODE_P (code))
+    return NULL;
+
+  lhs_oprnd = gimple_assign_lhs (last_stmt);
+  lhs_type = TREE_TYPE (lhs_oprnd);
+  if (!INTEGRAL_TYPE_P (lhs_type))
+    return NULL;
+
+  rhs_oprnd = gimple_assign_rhs1 (last_stmt);
+  if (TREE_CODE (rhs_oprnd) != SSA_NAME
+      || !has_single_use (rhs_oprnd))
+    return NULL;
+  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
+
+  /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
+  if (!is_gimple_call (popcount_stmt))
+    return NULL;
+  switch (gimple_call_combined_fn (popcount_stmt))
+    {
+    CASE_CFN_POPCOUNT:
+      break;
+    default:
+      return NULL;
+    }
+
+  if (gimple_call_num_args (popcount_stmt) != 1)
+    return NULL;
+
+  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
+  vect_unpromoted_value unprom_diff;
+  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
+						    &unprom_diff);
+
+  if (!rhs_origin)
+    return NULL;
+
+  /* Input and outout of .POPCOUNT should be same-precision integer.
+     Also A should be unsigned or same presion as temp_in,
+     otherwise there would be sign_extend from A to temp_in.  */
+  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
+      || (!TYPE_UNSIGNED (unprom_diff.type)
+	  && (TYPE_PRECISION (unprom_diff.type)
+	      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
+    return NULL;
+  vargs.safe_push (unprom_diff.op);
+
+  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
+  vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
+  /* Do it only the backend existed popcount<vector_mode>2.  */
+  if (!direct_internal_fn_supported_p (IFN_POPCOUNT,
+				       vec_type,
+				       OPTIMIZE_FOR_SPEED))
+    return NULL;
+
+  /* Create B = .POPCOUNT (A).  */
+  new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
+  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
+  gimple_call_set_lhs (pattern_stmt, new_var);
+  gimple_set_location (pattern_stmt, gimple_location (last_stmt));
+  *type_out = vec_type;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "created pattern stmt: %G", pattern_stmt);
+  return pattern_stmt;
+}
+
 /* Function vect_recog_pow_pattern
 
    Try to find the following pattern:
@@ -5283,6 +5394,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
+  { vect_recog_popcount_pattern, "popcount" },
   { vect_recog_widen_shift_pattern, "widen_shift" },
   { vect_recog_rotate_pattern, "rotate" },
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
H.J. Lu via Gcc-patches June 22, 2021, 2:44 a.m. | #3
On Tue, Jun 22, 2021 at 10:43 AM Hongtao Liu <crazylht@gmail.com> wrote:
>

> On Mon, Jun 21, 2021 at 6:05 PM Richard Biener

> <richard.guenther@gmail.com> wrote:

> >

> > On Thu, Jun 17, 2021 at 8:29 AM liuhongt <hongtao.liu@intel.com> wrote:

> > >

> > > The patch remove those pro- and demotions when backend support direct

> > > optab.

> > >

> > > For i386: it enables vectorization for vpopcntb/vpopcntw and optimized

> > > for vpopcntq.

> > >

> > > gcc/ChangeLog:

> > >

> > >         PR tree-optimization/97770

> > >         * tree-vect-patterns.c (vect_recog_popcount_pattern):

> > >         New.

> > >         (vect_recog_func vect_vect_recog_func_ptrs): Add new pattern.

> > >

> > > gcc/testsuite/ChangeLog:

> > >

> > >         PR tree-optimization/97770

> > >         * gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail.

> > >         * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.

> > > ---

> > >  .../gcc.target/i386/avx512bitalg-pr97770-1.c  |  27 +++--

> > >  .../i386/avx512vpopcntdq-pr97770-1.c          |   9 +-

> > >  gcc/tree-vect-patterns.c                      | 110 ++++++++++++++++++

> > >  3 files changed, 127 insertions(+), 19 deletions(-)

> > >

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> > > index c83a477045c..d1beec4cdb4 100644

> > > --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> > > +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c

> > > @@ -1,19 +1,18 @@

> > >  /* PR target/97770 */

> > >  /* { dg-do compile } */

> > > -/* { dg-options "-O2 -mavx512bitalg -mavx512vl -mprefer-vector-width=512" } */

> > > -/* Add xfail since no IFN for QI/HImode popcount */

> > > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */

> > > +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */

> > > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> > >

> > >  #include <immintrin.h>

> > >

> > >  void

> > >  __attribute__ ((noipa, optimize("-O3")))

> > > -popcountb_128 (char * __restrict dest, char* src)

> > > +popcountb_128 (unsigned char * __restrict dest, unsigned char* src)

> > >  {

> > >    for (int i = 0; i != 16; i++)

> > >      dest[i] = __builtin_popcount (src[i]);

> > > @@ -21,7 +20,7 @@ popcountb_128 (char * __restrict dest, char* src)

> > >

> > >  void

> > >  __attribute__ ((noipa, optimize("-O3")))

> > > -popcountw_128 (short* __restrict dest, short* src)

> > > +popcountw_128 (unsigned short* __restrict dest, unsigned short* src)

> > >  {

> > >    for (int i = 0; i != 8; i++)

> > >      dest[i] = __builtin_popcount (src[i]);

> > > @@ -29,7 +28,7 @@ popcountw_128 (short* __restrict dest, short* src)

> > >

> > >  void

> > >  __attribute__ ((noipa, optimize("-O3")))

> > > -popcountb_256 (char * __restrict dest, char* src)

> > > +popcountb_256 (unsigned char * __restrict dest, unsigned char* src)

> > >  {

> > >    for (int i = 0; i != 32; i++)

> > >      dest[i] = __builtin_popcount (src[i]);

> > > @@ -37,7 +36,7 @@ popcountb_256 (char * __restrict dest, char* src)

> > >

> > >  void

> > >  __attribute__ ((noipa, optimize("-O3")))

> > > -popcountw_256 (short* __restrict dest, short* src)

> > > +popcountw_256 (unsigned short* __restrict dest, unsigned short* src)

> > >  {

> > >    for (int i = 0; i != 16; i++)

> > >      dest[i] = __builtin_popcount (src[i]);

> > > @@ -45,7 +44,7 @@ popcountw_256 (short* __restrict dest, short* src)

> > >

> > >  void

> > >  __attribute__ ((noipa, optimize("-O3")))

> > > -popcountb_512 (char * __restrict dest, char* src)

> > > +popcountb_512 (unsigned char * __restrict dest, unsigned char* src)

> > >  {

> > >    for (int i = 0; i != 64; i++)

> > >      dest[i] = __builtin_popcount (src[i]);

> > > @@ -53,7 +52,7 @@ popcountb_512 (char * __restrict dest, char* src)

> > >

> > >  void

> > >  __attribute__ ((noipa, optimize("-O3")))

> > > -popcountw_512 (short* __restrict dest, short* src)

> > > +popcountw_512 (unsigned short* __restrict dest, unsigned short* src)

> > >  {

> > >    for (int i = 0; i != 32; i++)

> > >      dest[i] = __builtin_popcount (src[i]);

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> > > index 63bb00d9b4a..dedd2e4c3d6 100644

> > > --- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> > > +++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c

> > > @@ -1,13 +1,12 @@

> > >  /* PR target/97770 */

> > >  /* { dg-do compile } */

> > > -/* { dg-options "-O2 -mavx512vpopcntdq -mavx512vl -mprefer-vector-width=512" } */

> > > +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */

> > >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*xmm" 1 } } */

> > >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*ymm" 1 } } */

> > >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*zmm" 1 } } */

> > > -/* Add xfail since current vectorizor cannot generate expected code for DImode popcount */

> > > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1 { xfail *-*-* } } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1 { xfail *-*-* } } } */

> > > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1 { xfail *-*-* } } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */

> > > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */

> > >  #ifndef AVX512VPOPCNTQ_H_INCLUDED

> > >  #define AVX512VPOPCNTQ_H_INCLUDED

> > >

> > > diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c

> > > index 177d44ebb5e..5c80800efbb 100644

> > > --- a/gcc/tree-vect-patterns.c

> > > +++ b/gcc/tree-vect-patterns.c

> > > @@ -1292,6 +1292,115 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,

> > >                                       "vect_recog_widen_minus_pattern");

> > >  }

> > >

> > > +/* Function vect_recog_popcount_pattern

> > > +

> > > +   Try to find the following pattern:

> > > +

> > > +   UTYPE1 A;

> > > +   TYPE1 B;

> > > +   UTYPE2 temp_in;

> > > +   TYPE3 temp_out;

> > > +   temp_in = (TYPE2)A;

> > > +

> > > +   temp_out = __builtin_popcount{,l,ll} (temp_in);

> > > +   B = (TYPE1) temp_out;

> > > +

> > > +   TYPE2 may or may not be equal to TYPE3.

> > > +   i.e. TYPE2 is equal to TYPE3 for __builtin_popcount

> > > +   i.e. TYPE2 is not equal to TYPE3 for __builtin_popcountll

> > > +

> > > +   Input:

> > > +

> > > +   * STMT_VINFO: The stmt from which the pattern search begins.

> > > +   here it starts with B = (TYPE1) temp_out;

> > > +

> > > +   Output:

> > > +

> > > +   * TYPE_OUT: The vector type of the output of this pattern.

> > > +

> > > +   * Return value: A new stmt that will be used to replace the sequence of

> > > +   stmts that constitute the pattern. In this case it will be:

> > > +   B = .POPCOUNT (A);

> > > +*/

> > > +

> > > +static gimple *

> > > +vect_recog_popcount_pattern (vec_info *vinfo,

> > > +                            stmt_vec_info stmt_vinfo, tree *type_out)

> > > +{

> > > +  gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);

> > > +  gimple *popcount_stmt, *pattern_stmt;

> > > +  tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;

> > > +  auto_vec<tree> vargs;

> > > +

> > > +  /* Find B = (TYPE1) temp_out. */

> > > +  if (!last_stmt)

> > > +    return NULL;

> > > +  tree_code code = gimple_assign_rhs_code (last_stmt);

> > > +  if (!CONVERT_EXPR_CODE_P (code))

> > > +    return NULL;

> > > +

> > > +  lhs_oprnd = gimple_assign_lhs (last_stmt);

> > > +  lhs_type = TREE_TYPE (lhs_oprnd);

> > > +  if (TREE_CODE (lhs_type) != INTEGER_TYPE)

> > > +    return NULL;

> >

> > INTEGRAL_TYPE_P

> >

> Changed.

> > > +  rhs_oprnd = gimple_assign_rhs1 (last_stmt);

> > > +  if (TREE_CODE (rhs_oprnd) != SSA_NAME

> > > +      || !has_single_use (rhs_oprnd))

> > > +    return NULL;

> > > +  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);

> > > +

> > > +  /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */

> > > +  if (!is_gimple_call (popcount_stmt)

> > > +      || !gimple_call_lhs (popcount_stmt))

> >

> > Since you're arriving here via use-def chain the LHS will

> > never be NULL.

> >

Forgot to mention this part is also changed.
> > > +    return NULL;

> > > +  switch (gimple_call_combined_fn (popcount_stmt))

> > > +    {

> > > +    CASE_CFN_POPCOUNT:

> > > +      break;

> > > +    default:

> > > +      return NULL;

> > > +    }

> > > +

> >

> > for safety:

> >

> >     if (gimple_call_num_args (popcount_stmt) != 1)

> >       return NULL;

> >

> Changed.

> > > +  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);

> > > +  vect_unpromoted_value unprom_diff;

> > > +  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,

> > > +                                                   &unprom_diff);

> > > +

> > > +  if (!rhs_origin)

> > > +    return NULL;

> > > +

> > > +  /* Input and outout of .POPCOUNT should be same-precision integer.

> > > +     Also A should be unsigned or same presion as temp_in,

> > > +     otherwise there would be sign_extend from A to temp_in.  */

> > > +  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)

> > > +      || !(TYPE_UNSIGNED (unprom_diff.type)

> > > +          || (TYPE_PRECISION (unprom_diff.type)

> > > +              == TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))

> >

> > Note I find a if (A || !(B || C)) hard to read, please write if (A ||

> > (!B && !C)) instead.

> >

> Changed.

> > OK otherwise.

> >

> > Thanks,

> > Richard.

> >

> > > +    return NULL;

> > > +  vargs.safe_push (unprom_diff.op);

> > > +

> > > +  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);

> > > +  vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);

> > > +  /* Do it only the backend existed popcount<vector_mode>2.  */

> > > +  if (!direct_internal_fn_supported_p (IFN_POPCOUNT,

> > > +                                      vec_type,

> > > +                                      OPTIMIZE_FOR_SPEED))

> > > +    return NULL;

> > > +

> > > +  /* Create B = .POPCOUNT (A).  */

> > > +  new_var = vect_recog_temp_ssa_var (lhs_type, NULL);

> > > +  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);

> > > +  gimple_call_set_lhs (pattern_stmt, new_var);

> > > +  gimple_set_location (pattern_stmt, gimple_location (last_stmt));

> > > +  *type_out = vec_type;

> > > +

> > > +  if (dump_enabled_p ())

> > > +    dump_printf_loc (MSG_NOTE, vect_location,

> > > +                    "created pattern stmt: %G", pattern_stmt);

> > > +  return pattern_stmt;

> > > +}

> > > +

> > >  /* Function vect_recog_pow_pattern

> > >

> > >     Try to find the following pattern:

> > > @@ -5283,6 +5392,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {

> > >    { vect_recog_sad_pattern, "sad" },

> > >    { vect_recog_widen_sum_pattern, "widen_sum" },

> > >    { vect_recog_pow_pattern, "pow" },

> > > +  { vect_recog_popcount_pattern, "popcount" },

> > >    { vect_recog_widen_shift_pattern, "widen_shift" },

> > >    { vect_recog_rotate_pattern, "rotate" },

> > >    { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },

> > > --

> > > 2.18.1

> > >

>

> Thanks for the review, here is the patch I'm checking in.

>

> --

> BR,

> Hongtao




-- 
BR,
Hongtao

Patch

diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
index c83a477045c..d1beec4cdb4 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
@@ -1,19 +1,18 @@ 
 /* PR target/97770 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512bitalg -mavx512vl -mprefer-vector-width=512" } */
-/* Add xfail since no IFN for QI/HImode popcount */
-/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */
-/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */
+/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */
 
 #include <immintrin.h>
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountb_128 (char * __restrict dest, char* src)
+popcountb_128 (unsigned char * __restrict dest, unsigned char* src)
 {
   for (int i = 0; i != 16; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -21,7 +20,7 @@  popcountb_128 (char * __restrict dest, char* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountw_128 (short* __restrict dest, short* src)
+popcountw_128 (unsigned short* __restrict dest, unsigned short* src)
 {
   for (int i = 0; i != 8; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -29,7 +28,7 @@  popcountw_128 (short* __restrict dest, short* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountb_256 (char * __restrict dest, char* src)
+popcountb_256 (unsigned char * __restrict dest, unsigned char* src)
 {
   for (int i = 0; i != 32; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -37,7 +36,7 @@  popcountb_256 (char * __restrict dest, char* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountw_256 (short* __restrict dest, short* src)
+popcountw_256 (unsigned short* __restrict dest, unsigned short* src)
 {
   for (int i = 0; i != 16; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -45,7 +44,7 @@  popcountw_256 (short* __restrict dest, short* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountb_512 (char * __restrict dest, char* src)
+popcountb_512 (unsigned char * __restrict dest, unsigned char* src)
 {
   for (int i = 0; i != 64; i++)
     dest[i] = __builtin_popcount (src[i]);
@@ -53,7 +52,7 @@  popcountb_512 (char * __restrict dest, char* src)
 
 void
 __attribute__ ((noipa, optimize("-O3")))
-popcountw_512 (short* __restrict dest, short* src)
+popcountw_512 (unsigned short* __restrict dest, unsigned short* src)
 {
   for (int i = 0; i != 32; i++)
     dest[i] = __builtin_popcount (src[i]);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
index 63bb00d9b4a..dedd2e4c3d6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
@@ -1,13 +1,12 @@ 
 /* PR target/97770 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512vpopcntdq -mavx512vl -mprefer-vector-width=512" } */
+/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } */
 /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*xmm" 1 } } */
 /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*ymm" 1 } } */
 /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*zmm" 1 } } */
-/* Add xfail since current vectorizor cannot generate expected code for DImode popcount */
-/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 1  } } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 1  } } */
 #ifndef AVX512VPOPCNTQ_H_INCLUDED
 #define AVX512VPOPCNTQ_H_INCLUDED
 
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 177d44ebb5e..5c80800efbb 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -1292,6 +1292,115 @@  vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
 				      "vect_recog_widen_minus_pattern");
 }
 
+/* Function vect_recog_popcount_pattern
+
+   Try to find the following pattern:
+
+   UTYPE1 A;
+   TYPE1 B;
+   UTYPE2 temp_in;
+   TYPE3 temp_out;
+   temp_in = (TYPE2)A;
+
+   temp_out = __builtin_popcount{,l,ll} (temp_in);
+   B = (TYPE1) temp_out;
+
+   TYPE2 may or may not be equal to TYPE3.
+   i.e. TYPE2 is equal to TYPE3 for __builtin_popcount
+   i.e. TYPE2 is not equal to TYPE3 for __builtin_popcountll
+
+   Input:
+
+   * STMT_VINFO: The stmt from which the pattern search begins.
+   here it starts with B = (TYPE1) temp_out;
+
+   Output:
+
+   * TYPE_OUT: The vector type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the sequence of
+   stmts that constitute the pattern. In this case it will be:
+   B = .POPCOUNT (A);
+*/
+
+static gimple *
+vect_recog_popcount_pattern (vec_info *vinfo,
+			     stmt_vec_info stmt_vinfo, tree *type_out)
+{
+  gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
+  gimple *popcount_stmt, *pattern_stmt;
+  tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
+  auto_vec<tree> vargs;
+
+  /* Find B = (TYPE1) temp_out. */
+  if (!last_stmt)
+    return NULL;
+  tree_code code = gimple_assign_rhs_code (last_stmt);
+  if (!CONVERT_EXPR_CODE_P (code))
+    return NULL;
+
+  lhs_oprnd = gimple_assign_lhs (last_stmt);
+  lhs_type = TREE_TYPE (lhs_oprnd);
+  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
+    return NULL;
+
+  rhs_oprnd = gimple_assign_rhs1 (last_stmt);
+  if (TREE_CODE (rhs_oprnd) != SSA_NAME
+      || !has_single_use (rhs_oprnd))
+    return NULL;
+  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
+
+  /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
+  if (!is_gimple_call (popcount_stmt)
+      || !gimple_call_lhs (popcount_stmt))
+    return NULL;
+  switch (gimple_call_combined_fn (popcount_stmt))
+    {
+    CASE_CFN_POPCOUNT:
+      break;
+    default:
+      return NULL;
+    }
+
+  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
+  vect_unpromoted_value unprom_diff;
+  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
+						    &unprom_diff);
+
+  if (!rhs_origin)
+    return NULL;
+
+  /* Input and outout of .POPCOUNT should be same-precision integer.
+     Also A should be unsigned or same presion as temp_in,
+     otherwise there would be sign_extend from A to temp_in.  */
+  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
+      || !(TYPE_UNSIGNED (unprom_diff.type)
+	   || (TYPE_PRECISION (unprom_diff.type)
+	       == TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
+    return NULL;
+  vargs.safe_push (unprom_diff.op);
+
+  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
+  vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
+  /* Do it only the backend existed popcount<vector_mode>2.  */
+  if (!direct_internal_fn_supported_p (IFN_POPCOUNT,
+				       vec_type,
+				       OPTIMIZE_FOR_SPEED))
+    return NULL;
+
+  /* Create B = .POPCOUNT (A).  */
+  new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
+  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
+  gimple_call_set_lhs (pattern_stmt, new_var);
+  gimple_set_location (pattern_stmt, gimple_location (last_stmt));
+  *type_out = vec_type;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "created pattern stmt: %G", pattern_stmt);
+  return pattern_stmt;
+}
+
 /* Function vect_recog_pow_pattern
 
    Try to find the following pattern:
@@ -5283,6 +5392,7 @@  static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
+  { vect_recog_popcount_pattern, "popcount" },
   { vect_recog_widen_shift_pattern, "widen_shift" },
   { vect_recog_rotate_pattern, "rotate" },
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },