[Committed] middle-end: Parity and popcount folding optimizations.

Message ID 000601d6652b$9b8e9460$d2abbd20$@nextmovesoftware.com
State New
Headers show
Series
  • [Committed] middle-end: Parity and popcount folding optimizations.
Related show

Commit Message

Roger Sayle July 28, 2020, 10:08 p.m.
Here is the final version of this patch, implementing Richard's latest suggestion,
as committed to mainline.  This revision has been tested on x86_64-pc-linux-gnu
with a "make bootstrap" and "make -k check" with no new regressions.

Many thanks for the helpful feedback about match.pd's BUILT_IN_ iterators;
I'll post a follow-up patch to clean-up/make use of this idiom elsewhere in
match.pd, as it is much nicer.

Cheers,
Roger
--

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 

Sent: 23 July 2020 10:02
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Roger Sayle <roger@nextmovesoftware.com>
Subject: Re: [PATCH] middle-end: Fold popcount(x&4) to (x>>2)&1 and friends.

On Thu, Jul 23, 2020 at 10:15 AM Marc Glisse <marc.glisse@inria.fr> wrote:
>

> On Wed, 22 Jul 2020, Roger Sayle wrote:

>

> > Many thanks for the peer review and feedback.  I completely agree 

> > that POPCOUNT and PARITY iterators simplifies things and handle the IFN_ variants.

>

> Is there a reason why the iterators cannot be used for this one?

>

> +(for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL

> +              BUILT_IN_POPCOUNTIMAX)

> +     parity (BUILT_IN_PARITY BUILT_IN_PARITYL BUILT_IN_PARITYLL

> +            BUILT_IN_PARITYIMAX)

> +  (simplify

> +    (bit_and (popcount @0) integer_onep)

> +    (parity @0)))


I should indeed work fine to even do

 (simplify
   (bit_and (POPCOUNT @0) integer_onep)
   (PARITY @0))

Likewise for the existing

+/* popcount(X) == 0 is X == 0, and related (in)equalities.  */ (for 
+popcount (POPCOUNT)
   (for cmp (le eq ne gt)
        rep (eq eq ne ne)
     (simplify
       (cmp (popcount @0) integer_zerop)
       (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))

you can elide the (for ...)

OK with those changes.

Richard.

>

> --

> Marc Glisse

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index c6ae7a7..a052c9e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5964,25 +5964,51 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (IFN_FMA @0 @1 @2))))
 
 /* POPCOUNT simplifications.  */
-(for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL
-	       BUILT_IN_POPCOUNTIMAX)
-  /* popcount(X&1) is nop_expr(X&1).  */
-  (simplify
-    (popcount @0)
-    (if (tree_nonzero_bits (@0) == 1)
-      (convert @0)))
-  /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero.  */
-  (simplify
-    (plus (popcount:s @0) (popcount:s @1))
-    (if (wi::bit_and (tree_nonzero_bits (@0), tree_nonzero_bits (@1)) == 0)
-      (popcount (bit_ior @0 @1))))
-  /* popcount(X) == 0 is X == 0, and related (in)equalities.  */
+/* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero.  */
+(simplify
+  (plus (POPCOUNT:s @0) (POPCOUNT:s @1))
+  (if (wi::bit_and (tree_nonzero_bits (@0), tree_nonzero_bits (@1)) == 0)
+    (POPCOUNT (bit_ior @0 @1))))
+
+/* popcount(X) == 0 is X == 0, and related (in)equalities.  */
+(for popcount (POPCOUNT)
   (for cmp (le eq ne gt)
        rep (eq eq ne ne)
     (simplify
       (cmp (popcount @0) integer_zerop)
       (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
 
+/* Canonicalize POPCOUNT(x)&1 as PARITY(X).  */
+(simplify
+  (bit_and (POPCOUNT @0) integer_onep)
+  (PARITY @0))
+
+/* PARITY simplifications.  */
+/* parity(~X) is parity(X).  */
+(simplify
+  (PARITY (bit_not @0))
+  (PARITY @0))
+
+/* parity(X)^parity(Y) is parity(X^Y).  */
+(simplify
+  (bit_xor (PARITY:s @0) (PARITY:s @1))
+  (PARITY (bit_xor @0 @1)))
+
+/* Common POPCOUNT/PARITY simplifications.  */
+/* popcount(X&C1) is (X>>C2)&1 when C1 == 1<<C2.  Same for parity(X&C1).  */
+(for pfun (POPCOUNT PARITY)
+  (simplify
+    (pfun @0)
+    (with { wide_int nz = tree_nonzero_bits (@0); }
+      (switch
+	(if (nz == 1)
+	  (convert @0))
+	(if (wi::popcount (nz) == 1)
+	  (with { tree utype = unsigned_type_for (TREE_TYPE (@0)); }
+	    (convert (rshift:utype (convert:utype @0)
+				   { build_int_cst (integer_type_node,
+						    wi::ctz (nz)); }))))))))
+
 #if GIMPLE
 /* 64- and 32-bits branchless implementations of popcount are detected:
 
diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c
new file mode 100644
index 0000000..943726f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int test_and4(unsigned int a)
+{
+  return __builtin_popcount(a&4);
+}
+
+int test_and4l(unsigned long b)
+{
+  return __builtin_popcountl(b&4);
+}
+
+int test_and4ll(unsigned long long c)
+{
+  return __builtin_popcountll(c&4);
+}
+
+int test_shift(unsigned int d)
+{
+  int bits = 8*sizeof(unsigned int)-1;
+  return __builtin_popcount(d<<31);
+}
+
+int test_shiftl(unsigned long e)
+{
+  int bits = 8*sizeof(unsigned long)-1;
+  return __builtin_popcountl(e<<bits);
+}
+
+int test_shiftll(unsigned long long f)
+{
+  int bits = 8*sizeof(unsigned long long)-1;
+  return __builtin_popcountll(f<<bits);
+}
+
+/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-parity-1.c b/gcc/testsuite/gcc.dg/fold-parity-1.c
new file mode 100644
index 0000000..3ba56c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-1.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-original" } */
+
+int foo(unsigned int x)
+{
+  return __builtin_popcount(x) & 1;
+}
+
+int fool(unsigned long x)
+{
+  return __builtin_popcountl(x) & 1;
+}
+
+int fooll(unsigned long long x)
+{
+  return __builtin_popcountll(x) & 1;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "original" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_parity" 3 "original" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-parity-2.c b/gcc/testsuite/gcc.dg/fold-parity-2.c
new file mode 100644
index 0000000..8c7acbf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-2.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int foo(unsigned int x)
+{
+  return __builtin_parity(~x);
+}
+
+int fool(unsigned long x)
+{
+  return __builtin_parityl(~x);
+}
+
+int fooll(unsigned long long x)
+{
+  return __builtin_parityll(~x);
+}
+
+/* { dg-final { scan-tree-dump-times "~" 0 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-parity-3.c b/gcc/testsuite/gcc.dg/fold-parity-3.c
new file mode 100644
index 0000000..e0355cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-3.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int foo(unsigned int x)
+{
+  return __builtin_parity(x&1);
+}
+
+int fool(unsigned long x)
+{
+  return __builtin_parityl(x&1);
+}
+
+int fooll(unsigned long long x)
+{
+  return __builtin_parityll(x&1);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_parity" 0 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-parity-4.c b/gcc/testsuite/gcc.dg/fold-parity-4.c
new file mode 100644
index 0000000..5dfedab
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-4.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int foo(unsigned int x, unsigned int y)
+{
+  return __builtin_parity(x) ^ __builtin_parity(y);
+}
+
+int fool(unsigned long x, unsigned long y)
+{
+  return __builtin_parityl(x) ^ __builtin_parityl(y);
+}
+
+int fooll(unsigned long long x, unsigned long long y)
+{
+  return __builtin_parityll(x) ^ __builtin_parityll(y);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_parity" 3 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-parity-5.c b/gcc/testsuite/gcc.dg/fold-parity-5.c
new file mode 100644
index 0000000..69d3a6a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-parity-5.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int test_and4(unsigned int a)
+{
+  return __builtin_parity(a&4);
+}
+
+int test_and4l(unsigned long b)
+{
+  return __builtin_parityl(b&4);
+}
+
+int test_and4ll(unsigned long long c)
+{
+  return __builtin_parityll(c&4);
+}
+
+int test_shift(unsigned int d)
+{
+  int bits = 8*sizeof(unsigned int)-1;
+  return __builtin_parity(d<<31);
+}
+
+int test_shiftl(unsigned long e)
+{
+  int bits = 8*sizeof(unsigned long)-1;
+  return __builtin_parityl(e<<bits);
+}
+
+int test_shiftll(unsigned long long f)
+{
+  int bits = 8*sizeof(unsigned long long)-1;
+  return __builtin_parityll(f<<bits);
+}
+
+/* { dg-final { scan-tree-dump-times "parity" 0 "optimized" } } */
+