vec: don't select partial vectors when looping on full vectors

Message ID gkr4ko63os6.fsf@arm.com
State Superseded
Headers show
Series
  • vec: don't select partial vectors when looping on full vectors
Related show

Commit Message

Andrea Corallo Sept. 9, 2020, 4:20 p.m.
Hi all,

this patch avoids generating predication in a loop when the
loop operates only on full vectors.

Ex:

#+BEGIN_SRC C
/* Vector length is 256.  */
void
f (int *restrict x, int *restrict y, unsigned int n) {
  for (unsigned int i = 0; i < n * 8; ++i)
    x[i] += y[i];
}
#+END_SRC

Compiling on aarch64 with -O3 -msve-vector-bits=256 current trunk
gives:

#+BEGIN_SRC asm
f:
.LFB0:
        .cfi_startproc
        lsl     w2, w2, 3
        cbz     w2, .L1
        mov     x3, 0
        whilelo p0.s, xzr, x2
        .p2align 3,,7
.L3:
        ld1w    z0.s, p0/z, [x0, x3, lsl 2]
        ld1w    z1.s, p0/z, [x1, x3, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x3, lsl 2]
        add     x3, x3, 8
        whilelo p0.s, x3, x2
        b.any   .L3
.L1:
        ret
        .cfi_endproc
#+END_SRC

With the patch applied:

#+BEGIN_SRC asm
f:
.LFB0:
        .cfi_startproc
        lsl     w3, w2, 3
        cbz     w3, .L1
        mov     x2, 0
        ptrue   p0.b, vl32
        .p2align 3,,7
.L3:
        ld1w    z0.s, p0/z, [x0, x2, lsl 2]
        ld1w    z1.s, p0/z, [x1, x2, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x2, lsl 2]
        add     x2, x2, 8
        cmp     x2, x3
        bne     .L3
.L1:
        ret
        .cfi_endproc
#+END_SRC

To achieve this we check earlier whether the loop needs peeling, and if that
is not the case we do not set LOOP_VINFO_USING_PARTIAL_VECTORS_P to true.

I moved some logic from 'determine_peel_for_niter' to
'vect_need_peeling_or_part_vects_p' so it can be used for this purpose.

Bootstrapped and regtested on aarch64-linux-gnu.

Feedback is welcome, thanks.

  Andrea
From fdcceaa420d6c3b03cf22ab50e0f9c393e8e3932 Mon Sep 17 00:00:00 2001
From: Andrea Corallo <andrea.corallo@arm.com>

Date: Fri, 28 Aug 2020 16:01:15 +0100
Subject: [PATCH] vec: don't select partial vectors when unnecessary

gcc/ChangeLog

2020-09-09  Andrea Corallo  <andrea.corallo@arm.com>

	* tree-vect-loop.c (vect_need_peeling_or_part_vects_p): New function.
	(determine_peel_for_niter): Move out some logic into
	'vect_need_peeling_or_part_vects_p'.

gcc/testsuite/ChangeLog

2020-09-09  Andrea Corallo  <andrea.corallo@arm.com>

	* gcc.target/aarch64/sve/cost_model_10.c: New test.
	* gcc.target/aarch64/sve/clastb_8.c: Update test for new
	vectorization strategy.
	* gcc.target/aarch64/sve/cost_model_5.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_14.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_15.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_16.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_17.c: Likewise.
---
 .../gcc.target/aarch64/sve/clastb_8.c         |  5 +-
 .../gcc.target/aarch64/sve/cost_model_10.c    | 12 +++
 .../gcc.target/aarch64/sve/cost_model_5.c     |  4 +-
 .../gcc.target/aarch64/sve/struct_vect_14.c   |  8 +-
 .../gcc.target/aarch64/sve/struct_vect_15.c   |  8 +-
 .../gcc.target/aarch64/sve/struct_vect_16.c   |  8 +-
 .../gcc.target/aarch64/sve/struct_vect_17.c   |  8 +-
 gcc/tree-vect-loop.c                          | 86 +++++++++++--------
 8 files changed, 81 insertions(+), 58 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c

-- 
2.17.1

Comments

Richard Biener Sept. 10, 2020, 10:13 a.m. | #1
On Wed, 9 Sep 2020, Andrea Corallo wrote:

> Hi all,

> 

> this patch is meant not to generate predication in loop when the

> loop is operating only on full vectors.

> 

> Ex:

> 

> #+BEGIN_SRC C

> /* Vector length is 256.  */

> void

> f (int *restrict x, int *restrict y, unsigned int n) {

>   for (unsigned int i = 0; i < n * 8; ++i)

>     x[i] += y[i];

> }

> #+END_SRC

> 

> Compiling on aarch64 with -O3 -msve-vector-bits=256 current trunk

> gives:

> 

> #+BEGIN_SRC asm

> f:

> .LFB0:

>         .cfi_startproc

>         lsl     w2, w2, 3

>         cbz     w2, .L1

>         mov     x3, 0

>         whilelo p0.s, xzr, x2

>         .p2align 3,,7

> .L3:

>         ld1w    z0.s, p0/z, [x0, x3, lsl 2]

>         ld1w    z1.s, p0/z, [x1, x3, lsl 2]

>         add     z0.s, z0.s, z1.s

>         st1w    z0.s, p0, [x0, x3, lsl 2]

>         add     x3, x3, 8

>         whilelo p0.s, x3, x2

>         b.any   .L3

> .L1:

>         ret

>         .cfi_endproc

> #+END_SRC

> 

> With the patch applied:

> 

> #+BEGIN_SRC asm

> f:

> .LFB0:

>         .cfi_startproc

>         lsl     w3, w2, 3

>         cbz     w3, .L1

>         mov     x2, 0

>         ptrue   p0.b, vl32

>         .p2align 3,,7

> .L3:

>         ld1w    z0.s, p0/z, [x0, x2, lsl 2]

>         ld1w    z1.s, p0/z, [x1, x2, lsl 2]

>         add     z0.s, z0.s, z1.s

>         st1w    z0.s, p0, [x0, x2, lsl 2]

>         add     x2, x2, 8

>         cmp     x2, x3

>         bne     .L3

> .L1:

>         ret

>         .cfi_endproc

> #+END_SRC

> 

> To achieve this we check earlier if the loop needs peeling and if is

> not the case we do not set LOOP_VINFO_USING_PARTIAL_VECTORS_P to true.

> 

> I moved some logic from 'determine_peel_for_niter' to

> 'vect_need_peeling_or_part_vects_p' so it can be used for this purpose.

> 

> Bootstrapped and regtested on aarch64-linux-gnu.


Looks OK to me, the comment

@@ -2267,7 +2278,10 @@ start_over:
     {
       if (param_vect_partial_vector_usage == 0)
        LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-      else if (vect_verify_full_masking (loop_vinfo)
+      else if ((vect_verify_full_masking (loop_vinfo)
+               && vect_need_peeling_or_part_vects_p (loop_vinfo))
+              /* Don't use partial vectors if we don't need to peel the
+                 loop.  */
               || vect_verify_loop_lens (loop_vinfo))

seems to be oddly misplaced (I'd put it before the call).

Richard.

> Feedback is welcome, thanks.

> 

>   Andrea

> 

> 


-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
Richard Sandiford Sept. 11, 2020, 7:20 a.m. | #2
Richard Biener <rguenther@suse.de> writes:
> On Wed, 9 Sep 2020, Andrea Corallo wrote:

>> Hi all,

>> 

>> this patch is meant not to generate predication in loop when the

>> loop is operating only on full vectors.

>> 

>> Ex:

>> 

>> #+BEGIN_SRC C

>> /* Vector length is 256.  */

>> void

>> f (int *restrict x, int *restrict y, unsigned int n) {

>>   for (unsigned int i = 0; i < n * 8; ++i)

>>     x[i] += y[i];

>> }

>> #+END_SRC

>> 

>> Compiling on aarch64 with -O3 -msve-vector-bits=256 current trunk

>> gives:

>> 

>> #+BEGIN_SRC asm

>> f:

>> .LFB0:

>>         .cfi_startproc

>>         lsl     w2, w2, 3

>>         cbz     w2, .L1

>>         mov     x3, 0

>>         whilelo p0.s, xzr, x2

>>         .p2align 3,,7

>> .L3:

>>         ld1w    z0.s, p0/z, [x0, x3, lsl 2]

>>         ld1w    z1.s, p0/z, [x1, x3, lsl 2]

>>         add     z0.s, z0.s, z1.s

>>         st1w    z0.s, p0, [x0, x3, lsl 2]

>>         add     x3, x3, 8

>>         whilelo p0.s, x3, x2

>>         b.any   .L3

>> .L1:

>>         ret

>>         .cfi_endproc

>> #+END_SRC

>> 

>> With the patch applied:

>> 

>> #+BEGIN_SRC asm

>> f:

>> .LFB0:

>>         .cfi_startproc

>>         lsl     w3, w2, 3

>>         cbz     w3, .L1

>>         mov     x2, 0

>>         ptrue   p0.b, vl32

>>         .p2align 3,,7

>> .L3:

>>         ld1w    z0.s, p0/z, [x0, x2, lsl 2]

>>         ld1w    z1.s, p0/z, [x1, x2, lsl 2]

>>         add     z0.s, z0.s, z1.s

>>         st1w    z0.s, p0, [x0, x2, lsl 2]

>>         add     x2, x2, 8

>>         cmp     x2, x3

>>         bne     .L3

>> .L1:

>>         ret

>>         .cfi_endproc

>> #+END_SRC

>> 

>> To achieve this we check earlier if the loop needs peeling and if is

>> not the case we do not set LOOP_VINFO_USING_PARTIAL_VECTORS_P to true.

>> 

>> I moved some logic from 'determine_peel_for_niter' to

>> 'vect_need_peeling_or_part_vects_p' so it can be used for this purpose.

>> 

>> Bootstrapped and regtested on aarch64-linux-gnu.

>

> Looks OK to me, the comment

>

> @@ -2267,7 +2278,10 @@ start_over:

>      {

>        if (param_vect_partial_vector_usage == 0)

>         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;

> -      else if (vect_verify_full_masking (loop_vinfo)

> +      else if ((vect_verify_full_masking (loop_vinfo)

> +               && vect_need_peeling_or_part_vects_p (loop_vinfo))

> +              /* Don't use partial vectors if we don't need to peel the

> +                 loop.  */

>                || vect_verify_loop_lens (loop_vinfo))

>

> seems to be oddly misplaced (I'd put it before the call).


Yeah, IMO it'd be better to put it in the first “if”.

Also, very minor, but I think it'd be better not to shorten the name:

  vect_need_peeling_or_partial_vectors_p

Thanks,
Richard

Patch

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c
index 57c42082449..e61ff4ac92d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c
@@ -23,7 +23,4 @@  TEST_TYPE (uint64_t);
 /* { dg-final { scan-assembler {\tclastb\t(h[0-9]+), p[0-7], \1, z[0-9]+\.h\n} } } */
 /* { dg-final { scan-assembler {\tclastb\t(s[0-9]+), p[0-7], \1, z[0-9]+\.s\n} } } */
 /* { dg-final { scan-assembler {\tclastb\t(d[0-9]+), p[0-7], \1, z[0-9]+\.d\n} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.b,} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.h,} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s,} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler {\tptrue\tp[0-9]+\.b,} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c
new file mode 100644
index 00000000000..bfac09ed1c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c
@@ -0,0 +1,12 @@ 
+/* { dg-options "-O3 -msve-vector-bits=256" } */
+
+void
+f (int *restrict x, int *restrict y, unsigned int n)
+{
+  for (unsigned int i = 0; i < n * 8; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-assembler-not {\twhilelo\t} } } */
+/* { dg-final { scan-assembler {\tptrue\tp} } } */
+/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, x[0-9]+\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
index 250ca837324..f3a29fc38a1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
@@ -9,5 +9,5 @@  vset (int *restrict dst, int *restrict src, int count)
       *dst++ = 1;
 }
 
-/* { dg-final { scan-assembler-not {\tst1w\tz} } } */
-/* { dg-final { scan-assembler-times {\tstp\tq} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz} 2 } } */
+/* { dg-final { scan-assembler-not {\tstp\tq} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c
index a16a79e51c0..45644b67bda 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c
@@ -43,12 +43,12 @@ 
 #undef NAME
 #undef TYPE
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c
index bc00267c8e7..814dbb3ae41 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c
@@ -3,12 +3,12 @@ 
 
 #include "struct_vect_14.c"
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c
index 9e2a549f5e8..6ecf89b5442 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c
@@ -3,12 +3,12 @@ 
 
 #include "struct_vect_14.c"
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c
index e791e2e12a6..571c6d0d33b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c
@@ -3,12 +3,12 @@ 
 
 #include "struct_vect_14.c"
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 80e78f7adf4..730dc0130c6 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -991,6 +991,51 @@  vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
   return wi::min_precision (max_ni * factor, UNSIGNED);
 }
 
+/* true if the loop needs peeling or partial vectors when vectorized.  */
+
+static bool
+vect_need_peeling_or_part_vects_p (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+					  (loop_vinfo));
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+	 peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+	peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+	return true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      /* ??? When peeling for gaps but not alignment, we could
+	 try to check whether the (variable) niters is known to be
+	 VF * N + 1.  That's something of a niche case though.  */
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+      || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+	   < (unsigned) exact_log2 (const_vf))
+	  /* In case of versioning, check if the maximum number of
+	     iterations is greater than th.  If they are identical,
+	     the epilogue is unnecessary.  */
+	  && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+	      || ((unsigned HOST_WIDE_INT) max_niter
+		  > (th / const_vf) * const_vf))))
+    return true;
+
+  return false;
+}
+
 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    whether we can actually generate the masks required.  Return true if so,
    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
@@ -1967,44 +2012,10 @@  determine_peel_for_niter (loop_vec_info loop_vinfo)
 {
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
 
-  unsigned HOST_WIDE_INT const_vf;
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
-
-  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
-    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
-					  (loop_vinfo));
-
   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     /* The main loop handles all iterations.  */
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
-    {
-      /* Work out the (constant) number of iterations that need to be
-	 peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-	peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-    }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-	   /* ??? When peeling for gaps but not alignment, we could
-	      try to check whether the (variable) niters is known to be
-	      VF * N + 1.  That's something of a niche case though.  */
-	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-		< (unsigned) exact_log2 (const_vf))
-	       /* In case of versioning, check if the maximum number of
-		  iterations is greater than th.  If they are identical,
-		  the epilogue is unnecessary.  */
-	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-		   || ((unsigned HOST_WIDE_INT) max_niter
-		       > (th / const_vf) * const_vf))))
+  else if (vect_need_peeling_or_part_vects_p (loop_vinfo))
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
 }
 
@@ -2267,7 +2278,10 @@  start_over:
     {
       if (param_vect_partial_vector_usage == 0)
 	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-      else if (vect_verify_full_masking (loop_vinfo)
+      else if ((vect_verify_full_masking (loop_vinfo)
+		&& vect_need_peeling_or_part_vects_p (loop_vinfo))
+	       /* Don't use partial vectors if we don't need to peel the
+		  loop.  */
 	       || vect_verify_loop_lens (loop_vinfo))
 	{
 	  /* The epilogue and other known niters less than VF