Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/AArch64')
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll  3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll  280
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll  6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll  41
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll  7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll  228
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll  36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll  101
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll  6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll  8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll  7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll  8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll  7
13 files changed, 694 insertions, 44 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index bb70cbcfd4e9..8fa057f0f888 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -72,11 +72,10 @@ for.end:
}
define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
-;; FIXME: Should be calling sin_vec, once the cost of scalarizing is handled.
; CHECK-LABEL: @vec_intrinsic
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
new file mode 100644
index 000000000000..ed16349c14bb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -0,0 +1,280 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -S | FileCheck %s -check-prefix=CHECK
+
+define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd float %0, %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret float %add
+}
+
+define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK: %[[RDX4:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd float %0, %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %add
+}
+
+define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_interleave
+; CHECK: entry
+; CHECK: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK: %[[LOAD1:.*]] = load float, float* %a
+; CHECK: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
+; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK: for.end
+; CHECK: ret void
+entry:
+ %arrayidxa = getelementptr inbounds float, float* %a, i64 1
+ %a1 = load float, float* %a, align 4
+ %a2 = load float, float* %arrayidxa, align 4
+ br label %for.body
+
+for.body:
+ %add.phi1 = phi float [ %a2, %entry ], [ %add2, %for.body ]
+ %add.phi2 = phi float [ %a1, %entry ], [ %add1, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidxb1 = getelementptr inbounds float, float* %b, i64 %iv
+ %0 = load float, float* %arrayidxb1, align 4
+ %add1 = fadd float %0, %add.phi2
+ %or = or i64 %iv, 1
+ %arrayidxb2 = getelementptr inbounds float, float* %b, i64 %or
+ %1 = load float, float* %arrayidxb2, align 4
+ %add2 = fadd float %1, %add.phi1
+ %iv.next = add nuw nsw i64 %iv, 2
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+ store float %add1, float* %a, align 4
+ store float %add2, float* %arrayidxa, align 4
+ ret void
+}
+
+define float @fadd_invariant(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_invariant
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
+; CHECK: for.end.loopexit
+; CHECK: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK: ret float %[[PHI]]
+entry:
+ %arrayidx = getelementptr inbounds float, float* %a, i64 1
+ %0 = load float, float* %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 5.000000e-01
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %res.014 = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+ %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %b, i64 %iv
+ %2 = load float, float* %arrayidx4, align 4
+ %add = fadd float %1, %2
+ %rdx = fadd float %res.014, %add
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end: ; preds = %for.body, %entry
+ %res = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+ ret float %res
+}
+
+define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_conditional
+; CHECK: vector.body:
+; CHECK: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK: pred.load.continue6
+; CHECK: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
+; CHECK: %[[PRED:.*]] = select <4 x i1> %[[FCMP1]], <4 x float> %[[PHI1]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
+; CHECK: for.body
+; CHECK: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK: if.then
+; CHECK: %[[LOAD3:.*]] = load float, float*
+; CHECK: br label %for.inc
+; CHECK: for.inc
+; CHECK: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
+; CHECK: for.end
+; CHECK: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RDX_PHI]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %res = phi float [ 1.000000e+00, %entry ], [ %fadd, %for.inc ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %tobool = fcmp une float %0, 0.000000e+00
+ br i1 %tobool, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc:
+ %phi = phi float [ %1, %if.then ], [ 3.000000e+00, %for.body ]
+ %fadd = fadd float %res, %phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+ %rdx = phi float [ %fadd, %for.inc ]
+ ret float %rdx
+}
+
+; Test to check that the masking is correct, using the "llvm.loop.vectorize.predicate.enable" attribute
+define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @fadd_predicated
+; CHECK: vector.ph
+; CHECK: %[[TRIP_MINUS_ONE:.*]] = add i64 %n, -1
+; CHECK: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK: vector.body
+; CHECK: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
+; CHECK: pred.load.continue2
+; CHECK: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
+; CHECK: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
+; CHECK: for.end:
+; CHECK: %[[RES_PHI:.*]] = phi float [ undef, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RES_PHI]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %sum.02 = phi float [ %l7, %for.body ], [ 0.000000e+00, %entry ]
+ %l2 = getelementptr inbounds float, float* %a, i64 %iv
+ %l3 = load float, float* %l2, align 4
+ %l7 = fadd float %sum.02, %l3
+ %iv.next = add i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end: ; preds = %for.body
+ %sum.0.lcssa = phi float [ %l7, %for.body ]
+ ret float %sum.0.lcssa
+}
+
+; Negative test - loop contains multiple fadds which we cannot safely reorder
+define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
+; CHECK-LABEL: @fadd_multiple
+; CHECK: vector.body
+; CHECK: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK: middle.block
+; CHECK: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK: for.body
+; CHECK: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK: %[[LOAD1:.*]] = load float, float*
+; CHECK: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK: for.end
+; CHECK: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RET]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd float %sum, %0
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %add3 = fadd float %add, %1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.body
+ %rdx = phi float [ %add3, %for.body ]
+ ret float %rdx
+}
+
+!0 = distinct !{!0, !4, !7, !9}
+!1 = distinct !{!1, !4, !8, !9}
+!2 = distinct !{!2, !5, !7, !9}
+!3 = distinct !{!3, !6, !7, !9, !10}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.width", i32 2}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
+!8 = !{!"llvm.loop.interleave.count", i32 4}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
index f59b2bf85db3..d548ca76d9a6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
@@ -1,9 +1,5 @@
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S 2>%t | FileCheck %s
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S | FileCheck %s
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
index 37e70adf64f8..2536c6c85dea 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
@@ -76,6 +76,47 @@ exit: ; preds = %for.inc
ret void
}
+define void @invariant_load_cond(i32* noalias nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %cond, i64 %n) {
+; CHECK-LABEL: @invariant_load_cond
+; CHECK: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
+; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %[[GEP]], i32 0
+; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32*> %[[SPLATINS]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT: %[[ICMP:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK: %[[MASKED_LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[BITCAST:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: %[[MASKED_GATHER:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[SPLAT]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[MASKED_GATHER]], %[[MASKED_LOAD]]
+; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>* %[[BITCAST1:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ %arrayidx2 = getelementptr inbounds i32, i32* %cond, i64 %iv
+ %0 = load i32, i32* %arrayidx2, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %iv
+ %1 = load i32, i32* %arrayidx3, align 4
+ %2 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %2, %1
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %iv
+ store i32 %add, i32* %arrayidx4, align 4
+ br label %for.inc
+
+for.inc:
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
!0 = distinct !{!0, !1, !2, !3, !4, !5}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
index 2e916b2d7262..56a53a5748a3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -1,9 +1,4 @@
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
new file mode 100644
index 000000000000..192d181019a8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -0,0 +1,228 @@
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine < %s -S | FileCheck %s
+
+; Test that we can add on the induction variable
+; for (long long i = 0; i < n; i++) {
+; a[i] = b[i] + i;
+; }
+; with an unroll factor (interleave count) of 2.
+
+define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled(
+; CHECK-NEXT: entry:
+; CHECK: vector.body:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: %[[EC:.*]] = shl i64 %[[VSCALE]], 1
+; CHECK-NEXT: %[[TMP2:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT: %[[ECSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: %[[TMP3:.*]] = add <vscale x 2 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK: %[[LOAD1:.*]] = load <vscale x 2 x i64>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 2 x i64>
+; CHECK: %[[STOREVAL1:.*]] = add nsw <vscale x 2 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK: %[[STOREVAL2:.*]] = add nsw <vscale x 2 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK: store <vscale x 2 x i64> %[[STOREVAL1]]
+; CHECK: store <vscale x 2 x i64> %[[STOREVAL2]]
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+ %0 = load i64, i64* %arrayidx, align 8
+ %add = add nsw i64 %0, %i.08
+ %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+ store i64 %add, i64* %arrayidx1, align 8
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit: ; preds = %for.body
+ ret void
+}
+
+
+; Same as above, except we test with a vectorisation factor of (1, scalable)
+
+define void @add_ind64_unrolled_nxv1i64(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled_nxv1i64(
+; CHECK-NEXT: entry:
+; CHECK: vector.body:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT: %[[STEPVEC:.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT: %[[EC:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: %[[TMP2:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT: %[[ECSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP2]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: %[[TMP3:.*]] = add <vscale x 1 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK: %[[LOAD1:.*]] = load <vscale x 1 x i64>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 1 x i64>
+; CHECK: %[[STOREVAL1:.*]] = add nsw <vscale x 1 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK: %[[STOREVAL2:.*]] = add nsw <vscale x 1 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK: store <vscale x 1 x i64> %[[STOREVAL1]]
+; CHECK: store <vscale x 1 x i64> %[[STOREVAL2]]
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+ %0 = load i64, i64* %arrayidx, align 8
+ %add = add nsw i64 %0, %i.08
+ %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+ store i64 %add, i64* %arrayidx1, align 8
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !9
+
+exit: ; preds = %for.body
+ ret void
+}
+
+
+; Test that we can vectorize a separate induction variable (not used for the branch)
+; int r = 0;
+; for (long long i = 0; i < n; i++) {
+; a[i] = r;
+; r += 2;
+; }
+; with an unroll factor (interleave count) of 1.
+
+
+define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_ind32(
+; CHECK: vector.ph:
+; CHECK: %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: %[[INDINIT:.*]] = shl <vscale x 4 x i32> %[[STEPVEC]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[INC:.*]] = shl i32 %[[VSCALE]], 3
+; CHECK-NEXT: %[[TMP:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INC]], i32 0
+; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x i32> %[[TMP]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: %[[VECIND:.*]] = phi <vscale x 4 x i32> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK: store <vscale x 4 x i32> %[[VECIND]]
+; CHECK: %[[VECINDNXT]] = add <vscale x 4 x i32> %[[VECIND]], %[[VECINC]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.08
+ store i32 %r.07, i32* %arrayidx, align 4
+ %add = add nuw nsw i32 %r.07, 2
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit: ; preds = %for.body
+ ret void
+}
+
+
+; Test that we can vectorize a separate FP induction variable (not used for the branch)
+; float r = 0;
+; for (long long i = 0; i < n; i++) {
+; a[i] = r;
+; r += 2;
+; }
+; with an unroll factor (interleave count) of 1.
+
+define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_indf32(
+; CHECK: vector.ph:
+; CHECK: %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: %[[TMP1:.*]] = uitofp <vscale x 4 x i32> %[[STEPVEC]] to <vscale x 4 x float>
+; CHECK-NEXT: %[[TMP2:.*]] = fmul <vscale x 4 x float> %[[TMP1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[INDINIT:.*]] = fadd <vscale x 4 x float> %[[TMP2]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[TMP3:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[TMP4:.*]] = sitofp i32 %[[TMP3]] to float
+; CHECK-NEXT: %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00
+; CHECK-NEXT: %[[TMP5:.*]] = insertelement <vscale x 4 x float> poison, float %[[INC]], i32 0
+; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x float> %[[TMP5]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: %[[VECIND:.*]] = phi <vscale x 4 x float> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK: store <vscale x 4 x float> %[[VECIND]]
+; CHECK: %[[VECINDNXT]] = fadd <vscale x 4 x float> %[[VECIND]], %[[VECINC]]
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %r.07 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %i.08
+ store float %r.07, float* %arrayidx, align 4
+ %add = fadd float %r.07, 2.000000e+00
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit: ; preds = %for.body
+ ret void
+}
+
+; Test a case where the vectorised induction variable is used to
+; generate a mask:
+; for (long long i = 0; i < n; i++) {
+; if (i & 0x1)
+; a[i] = b[i];
+; }
+
+define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @cond_ind64(
+; CHECK: vector.body:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK: %[[STEPVEC:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 4 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 4 x i64> %[[TMP1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: %[[VECIND:.*]] = add <vscale x 4 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT: %[[MASK:.*]] = trunc <vscale x 4 x i64> %[[VECIND]] to <vscale x 4 x i1>
+; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x i32> poison)
+; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[LOAD]], <vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]])
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc
+ %i.08 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+ %and = and i64 %i.08, 1
+ %tobool.not = icmp eq i64 %and, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.08
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.08
+ store i32 %0, i32* %arrayidx1, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit: ; preds = %for.inc
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 2}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !7, !3, !8, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.interleave.count", i32 1}
+!9 = distinct !{!9, !1, !10, !3, !4, !5}
+!10 = !{!"llvm.loop.vectorize.width", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
new file mode 100644
index 000000000000..a12ec7f29c42
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu < %s | FileCheck %s
+
+define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) {
+; CHECK-LABEL: @invariant_load
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
+; CHECK-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]]
+; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INVLOAD]], i32 0
+; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[SPLATINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[SPLAT]], %[[LOAD]]
+; CHECK: store <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>*
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 42
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %iv
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
new file mode 100644
index 000000000000..e804dd3fe3da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
@@ -0,0 +1,101 @@
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S <%s | FileCheck %s
+
+define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @stride7_i32(
+; CHECK: vector.body
+; CHECK: %[[VEC_IND:.*]] = phi <vscale x 4 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 4 x i64> %[[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 7, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, i32* %dst, <vscale x 4 x i64> %[[PTR_INDICES]]
+; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[PTRS]]
+; CHECK-NEXT: %[[VALS:.*]] = add nsw <vscale x 4 x i32> %[[GLOAD]],
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VALS]], <vscale x 4 x i32*> %[[PTRS]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %mul = mul nuw nsw i64 %i.05, 7
+ %arrayidx = getelementptr inbounds i32, i32* %dst, i64 %mul
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 3
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nuw nsw i64 %i.05, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+define void @stride7_f64(double* noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @stride7_f64(
+; CHECK: vector.body
+; CHECK: %[[VEC_IND:.*]] = phi <vscale x 2 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 2 x i64> %[[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, <vscale x 2 x i64> %[[PTR_INDICES]]
+; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %[[PTRS]],
+; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]],
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %[[VALS]], <vscale x 2 x double*> %[[PTRS]],
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %mul = mul nuw nsw i64 %i.05, 7
+ %arrayidx = getelementptr inbounds double, double* %dst, i64 %mul
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %0, 1.000000e+00
+ store double %add, double* %arrayidx, align 8
+ %inc = add nuw nsw i64 %i.05, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+
+define void @cond_stride7_f64(double* noalias nocapture %dst, i64* noalias nocapture readonly %cond, i64 %n) {
+; CHECK-LABEL: @cond_stride7_f64(
+; CHECK: vector.body
+; CHECK: %[[MASK:.*]] = icmp ne <vscale x 2 x i64>
+; CHECK: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, <vscale x 2 x i64> %{{.*}}
+; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]]
+; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]],
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %[[VALS]], <vscale x 2 x double*> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]])
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc
+ %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, i64* %cond, i64 %i.07
+ %0 = load i64, i64* %arrayidx, align 8
+ %tobool.not = icmp eq i64 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %mul = mul nsw i64 %i.07, 7
+ %arrayidx1 = getelementptr inbounds double, double* %dst, i64 %mul
+ %1 = load double, double* %arrayidx1, align 8
+ %add = fadd double %1, 1.000000e+00
+ store double %add, double* %arrayidx1, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i64 %i.07, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !7, !3, !4, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll
index 34264562e71f..ee80553853eb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll
@@ -1,5 +1,4 @@
-; RUN: opt -S -loop-vectorize -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+; RUN: opt -S -loop-vectorize -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; This test is checking that a scalable load inside a loop does not trigger a
; TypeSize error in the loop vectorization legality analysis. It is possible for
@@ -8,9 +7,6 @@
; load, it should not be considered for analysis, and we should not see a
; TypeSize error.
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning:
-
; #include <arm_sve.h>
;
; void scalable_load_in_loop(long n, int *a, int *b, svuint32_t *x,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index d803f6b75ed9..ab45cd847a26 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -10,13 +10,7 @@
; The test checks if the mask is being correctly created, reverted and used
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
-
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index aef5efe030f5..c43927045550 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -4,12 +4,7 @@
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{
; CHECK-LABEL: @vector_reverse_f64
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 22de7c1e1ca8..5d539ca36d49 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -10,13 +10,7 @@
; The test checks if the mask is being correctly created, reverted and used
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
-
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
index ae3aad1e75a8..311dfdb068ca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -5,12 +5,7 @@
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
-; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
+; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
; CHECK-LABEL: vector_reverse_f64