Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/AArch64')
13 files changed, 694 insertions, 44 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index bb70cbcfd4e9..8fa057f0f888 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -72,11 +72,10 @@ for.end:
 }
 
 define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
-;; FIXME: Should be calling sin_vec, once the cost of scalarizing is handled.
 ; CHECK-LABEL: @vec_intrinsic
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
new file mode 100644
index 000000000000..ed16349c14bb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -0,0 +1,280 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -S | FileCheck %s -check-prefix=CHECK
+
+define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %add
+}
+
+define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_interleave
+; CHECK: entry
+; CHECK: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK: %[[LOAD1:.*]] = load float, float* %a
+; CHECK: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
+; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK: for.end
+; CHECK: ret void
+entry:
+  %arrayidxa = getelementptr inbounds float, float* %a, i64 1
+  %a1 = load float, float* %a, align 4
+  %a2 = load float, float* %arrayidxa, align 4
+  br label %for.body
+
+for.body:
+  %add.phi1 = phi float [ %a2, %entry ], [ %add2, %for.body ]
+  %add.phi2 = phi float [ %a1, %entry ], [ %add1, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidxb1 = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidxb1, align 4
+  %add1 = fadd float %0, %add.phi2
+  %or = or i64 %iv, 1
+  %arrayidxb2 = getelementptr inbounds float, float* %b, i64 %or
+  %1 = load float, float* %arrayidxb2, align 4
+  %add2 = fadd float %1, %add.phi1
+  %iv.next = add nuw nsw i64 %iv, 2
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  store float %add1, float* %a, align 4
+  store float %add2, float* %arrayidxa, align 4
+  ret void
+}
+
+define float @fadd_invariant(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_invariant
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
+; CHECK: for.end.loopexit
+; CHECK: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK: ret float %[[PHI]]
+entry:
+  %arrayidx = getelementptr inbounds float, float* %a, i64 1
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 5.000000e-01
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res.014 = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %b, i64 %iv
+  %2 = load float, float* %arrayidx4, align 4
+  %add = fadd float %1, %2
+  %rdx = fadd float %res.014, %add
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:                                          ; preds = %for.body, %entry
+  %res = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  ret float %res
+}
+
+define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_conditional
+; CHECK: vector.body:
+; CHECK: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK: pred.load.continue6
+; CHECK: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
+; CHECK: %[[PRED:.*]] = select <4 x i1> %[[FCMP1]], <4 x float> %[[PHI1]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
+; CHECK: for.body
+; CHECK: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK: if.then
+; CHECK: %[[LOAD3:.*]] = load float, float*
+; CHECK: br label %for.inc
+; CHECK: for.inc
+; CHECK: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
+; CHECK: for.end
+; CHECK: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RDX_PHI]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %res = phi float [ 1.000000e+00, %entry ], [ %fadd, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %tobool = fcmp une float %0, 0.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:
+  %phi = phi float [ %1, %if.then ], [ 3.000000e+00, %for.body ]
+  %fadd = fadd float %res, %phi
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  %rdx = phi float [ %fadd, %for.inc ]
+  ret float %rdx
+}
+
+; Test to check masking is correct, using the "llvm.loop.vectorize.predicate.enable" attribute
+define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @fadd_predicated
+; CHECK: vector.ph
+; CHECK: %[[TRIP_MINUS_ONE:.*]] = add i64 %n, -1
+; CHECK: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK: vector.body
+; CHECK: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
+; CHECK: pred.load.continue2
+; CHECK: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
+; CHECK: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
+; CHECK: for.end:
+; CHECK: %[[RES_PHI:.*]] = phi float [ undef, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RES_PHI]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi float [ %l7, %for.body ], [ 0.000000e+00, %entry ]
+  %l2 = getelementptr inbounds float, float* %a, i64 %iv
+  %l3 = load float, float* %l2, align 4
+  %l7 = fadd float %sum.02, %l3
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end:                                          ; preds = %for.body
+  %sum.0.lcssa = phi float [ %l7, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; Negative test - loop contains multiple fadds which we cannot safely reorder
+define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
+; CHECK-LABEL: @fadd_multiple
+; CHECK: vector.body
+; CHECK: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK: middle.block
+; CHECK: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK: for.body
+; CHECK: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK: %[[LOAD1:.*]] = load float, float*
+; CHECK: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK: for.end
+; CHECK: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RET]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %sum, %0
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add3 = fadd float %add, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  %rdx = phi float [ %add3, %for.body ]
+  ret float %rdx
+}
+
+!0 = distinct !{!0, !4, !7, !9}
+!1 = distinct !{!1, !4, !8, !9}
+!2 = distinct !{!2, !5, !7, !9}
+!3 = distinct !{!3, !6, !7, !9, !10}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.width", i32 2}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
+!8 = !{!"llvm.loop.interleave.count", i32 4}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
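
A note on the property the checks above rely on: a call to llvm.vector.reduce.fadd without the 'reassoc' fast-math flag is a strict, in-order reduction, folding the start value and then each vector element left to right, so it preserves the rounding behaviour of the original scalar loop; with 'reassoc' the summation order is unspecified. A minimal standalone sketch of the two forms (the function names here are illustrative, not part of the patch):

; Strict: computes ((((%start + v0) + v1) + v2) + v3) exactly as the
; scalar loop would, which is what -enable-strict-reductions depends on.
define float @ordered_sum(float %start, <4 x float> %v) {
  %sum = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %v)
  ret float %sum
}

; Relaxed: 'reassoc' allows the elements to be summed in any order
; (e.g. as a tree), which is only acceptable under fast-math.
define float @relaxed_sum(float %start, <4 x float> %v) {
  %sum = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %v)
  ret float %sum
}

declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)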
+!6 = !{!"llvm.loop.vectorize.width", i32 2} +!7 = !{!"llvm.loop.interleave.count", i32 1} +!8 = !{!"llvm.loop.interleave.count", i32 4} +!9 = !{!"llvm.loop.vectorize.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll index f59b2bf85db3..d548ca76d9a6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll @@ -1,9 +1,5 @@ -; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S 2>%t | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S | FileCheck %s -; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t - -; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. -; WARN-NOT: warning target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 37e70adf64f8..2536c6c85dea 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -76,6 +76,47 @@ exit: ; preds = %for.inc ret void } +define void @invariant_load_cond(i32* noalias nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %cond, i64 %n) { +; CHECK-LABEL: @invariant_load_cond +; CHECK: vector.body +; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42 +; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %[[GEP]], i32 0 +; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32*> %[[SPLATINS]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer +; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* +; CHECK-NEXT: %[[ICMP:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) +; CHECK: %[[MASKED_LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[BITCAST:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> poison) +; CHECK-NEXT: %[[MASKED_GATHER:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[SPLAT]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> undef) +; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[MASKED_GATHER]], %[[MASKED_LOAD]] +; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>* %[[BITCAST1:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]]) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42 + %arrayidx2 = getelementptr inbounds i32, i32* %cond, i64 %iv + %0 = load i32, i32* %arrayidx2, align 4 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: + %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx3, align 4 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %2, %1 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 %add, i32* %arrayidx4, align 4 + br label %for.inc 
+ +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll index 2e916b2d7262..56a53a5748a3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll @@ -1,9 +1,4 @@ -; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s - -; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t - -; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. -; WARN-NOT: warning +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll new file mode 100644 index 000000000000..192d181019a8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -0,0 +1,228 @@ +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine < %s -S | FileCheck %s + +; Test that we can add on the induction variable +; for (long long i = 0; i < n; i++) { +; a[i] = b[i] + i; +; } +; with an unroll factor (interleave count) of 2. + +define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) { +; CHECK-LABEL: @add_ind64_unrolled( +; CHECK-NEXT: entry: +; CHECK: vector.body: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK-NEXT: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[INDEX]], i32 0 +; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[STEPVEC]] +; CHECK-NEXT: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %[[EC:.*]] = shl i64 %[[VSCALE]], 1 +; CHECK-NEXT: %[[TMP2:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[EC]], i32 0 +; CHECK-NEXT: %[[ECSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: %[[TMP3:.*]] = add <vscale x 2 x i64> %[[ECSPLT]], %[[STEPVEC]] +; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[TMP3]] +; CHECK: %[[LOAD1:.*]] = load <vscale x 2 x i64> +; CHECK: %[[LOAD2:.*]] = load <vscale x 2 x i64> +; CHECK: %[[STOREVAL1:.*]] = add nsw <vscale x 2 x i64> %[[LOAD1]], %[[VECIND1]] +; CHECK: %[[STOREVAL2:.*]] = add nsw <vscale x 2 x i64> %[[LOAD2]], %[[VECIND2]] +; CHECK: store <vscale x 2 x i64> %[[STOREVAL1]] +; CHECK: store <vscale x 2 x i64> %[[STOREVAL2]] + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08 + %0 = load i64, i64* %arrayidx, align 8 + %add = add nsw i64 %0, %i.08 + 
%arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08 + store i64 %add, i64* %arrayidx1, align 8 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + ret void +} + + +; Same as above, except we test with a vectorisation factor of (1, scalable) + +define void @add_ind64_unrolled_nxv1i64(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) { +; CHECK-LABEL: @add_ind64_unrolled_nxv1i64( +; CHECK-NEXT: entry: +; CHECK: vector.body: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK-NEXT: %[[STEPVEC:.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[INDEX]], i32 0 +; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer +; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[STEPVEC]] +; CHECK-NEXT: %[[EC:.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: %[[TMP2:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[EC]], i32 0 +; CHECK-NEXT: %[[ECSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP2]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer +; CHECK-NEXT: %[[TMP3:.*]] = add <vscale x 1 x i64> %[[ECSPLT]], %[[STEPVEC]] +; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[TMP3]] +; CHECK: %[[LOAD1:.*]] = load <vscale x 1 x i64> +; CHECK: %[[LOAD2:.*]] = load <vscale x 1 x i64> +; CHECK: %[[STOREVAL1:.*]] = add nsw <vscale x 1 x i64> %[[LOAD1]], %[[VECIND1]] +; CHECK: %[[STOREVAL2:.*]] = add nsw <vscale x 1 x i64> %[[LOAD2]], %[[VECIND2]] +; CHECK: store <vscale x 1 x i64> %[[STOREVAL1]] +; CHECK: store <vscale x 1 x i64> %[[STOREVAL2]] + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08 + %0 = load i64, i64* %arrayidx, align 8 + %add = add nsw i64 %0, %i.08 + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08 + store i64 %add, i64* %arrayidx1, align 8 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !9 + +exit: ; preds = %for.body + ret void +} + + +; Test that we can vectorize a separate induction variable (not used for the branch) +; int r = 0; +; for (long long i = 0; i < n; i++) { +; a[i] = r; +; r += 2; +; } +; with an unroll factor (interleave count) of 1. 
+ + +define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) { +; CHECK-LABEL: @add_unique_ind32( +; CHECK: vector.ph: +; CHECK: %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: %[[INDINIT:.*]] = shl <vscale x 4 x i32> %[[STEPVEC]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer) +; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[INC:.*]] = shl i32 %[[VSCALE]], 3 +; CHECK-NEXT: %[[TMP:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INC]], i32 0 +; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x i32> %[[TMP]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; CHECK: vector.body: +; CHECK: %[[VECIND:.*]] = phi <vscale x 4 x i32> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ] +; CHECK: store <vscale x 4 x i32> %[[VECIND]] +; CHECK: %[[VECINDNXT]] = add <vscale x 4 x i32> %[[VECIND]], %[[VECINC]] +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.08 + store i32 %r.07, i32* %arrayidx, align 4 + %add = add nuw nsw i32 %r.07, 2 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6 + +exit: ; preds = %for.body + ret void +} + + +; Test that we can vectorize a separate FP induction variable (not used for the branch) +; float r = 0; +; for (long long i = 0; i < n; i++) { +; a[i] = r; +; r += 2; +; } +; with an unroll factor (interleave count) of 1. + +define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) { +; CHECK-LABEL: @add_unique_indf32( +; CHECK: vector.ph: +; CHECK: %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: %[[TMP1:.*]] = uitofp <vscale x 4 x i32> %[[STEPVEC]] to <vscale x 4 x float> +; CHECK-NEXT: %[[TMP2:.*]] = fmul <vscale x 4 x float> %[[TMP1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer) +; CHECK-NEXT: %[[INDINIT:.*]] = fadd <vscale x 4 x float> %[[TMP2]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer) +; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[TMP3:.*]] = shl i32 %8, 2 +; CHECK-NEXT: %[[TMP4:.*]] = sitofp i32 %[[TMP3]] to float +; CHECK-NEXT: %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00 +; CHECK-NEXT: %[[TMP5:.*]] = insertelement <vscale x 4 x float> poison, float %[[INC]], i32 0 +; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x float> %[[TMP5]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer +; CHECK: vector.body: +; CHECK: %[[VECIND:.*]] = phi <vscale x 4 x float> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ] +; CHECK: store <vscale x 4 x float> %[[VECIND]] +; CHECK: %[[VECINDNXT]] = fadd <vscale x 4 x float> %[[VECIND]], %[[VECINC]] + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %r.07 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %i.08 + 
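
The CHECK lines in @add_ind64_unrolled above encode the recipe the vectorizer uses to materialise a scalable vector induction variable: broadcast the scalar index, add a step vector for the first part, and add a further splat of the part size (vscale x 2 elements) for the second part. A self-contained sketch of that recipe, outside any diff (the function and value names here are illustrative):

; Builds the two <vscale x 2 x i64> induction parts for one iteration of
; an interleave-by-2 vector loop, given the scalar index %index.
define void @vec_iv_parts(i64 %index, <vscale x 2 x i64>* %out1, <vscale x 2 x i64>* %out2) {
  %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
  %ins = insertelement <vscale x 2 x i64> poison, i64 %index, i32 0
  %splat = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
  ; Part 1: index + <0, 1, ..., vscale*2-1>
  %part1 = add <vscale x 2 x i64> %splat, %step
  ; Part 2 is offset by the part size, vscale * 2 elements.
  %vscale = call i64 @llvm.vscale.i64()
  %ec = shl i64 %vscale, 1
  %ec.ins = insertelement <vscale x 2 x i64> poison, i64 %ec, i32 0
  %ec.splat = shufflevector <vscale x 2 x i64> %ec.ins, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
  %step.off = add <vscale x 2 x i64> %ec.splat, %step
  %part2 = add <vscale x 2 x i64> %splat, %step.off
  store <vscale x 2 x i64> %part1, <vscale x 2 x i64>* %out1
  store <vscale x 2 x i64> %part2, <vscale x 2 x i64>* %out2
  ret void
}

declare <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
declare i64 @llvm.vscale.i64()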
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
new file mode 100644
index 000000000000..a12ec7f29c42
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu < %s | FileCheck %s
+
+define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) {
+; CHECK-LABEL: @invariant_load
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
+; CHECK-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]]
+; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INVLOAD]], i32 0
+; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[SPLATINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[SPLAT]], %[[LOAD]]
+; CHECK: store <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>*
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
new file mode 100644
index 000000000000..e804dd3fe3da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
@@ -0,0 +1,101 @@
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S <%s | FileCheck %s
+
+define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @stride7_i32(
+; CHECK: vector.body
+; CHECK: %[[VEC_IND:.*]] = phi <vscale x 4 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 4 x i64> %[[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 7, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, i32* %dst, <vscale x 4 x i64> %[[PTR_INDICES]]
+; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[PTRS]]
+; CHECK-NEXT: %[[VALS:.*]] = add nsw <vscale x 4 x i32> %[[GLOAD]],
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VALS]], <vscale x 4 x i32*> %[[PTRS]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = mul nuw nsw i64 %i.05, 7
+  %arrayidx = getelementptr inbounds i32, i32* %dst, i64 %mul
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx, align 4
+  %inc = add nuw nsw i64 %i.05, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+define void @stride7_f64(double* noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @stride7_f64(
+; CHECK: vector.body
+; CHECK: %[[VEC_IND:.*]] = phi <vscale x 2 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 2 x i64> %[[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, <vscale x 2 x i64> %[[PTR_INDICES]]
+; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %[[PTRS]],
+; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]],
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %[[VALS]], <vscale x 2 x double*> %[[PTRS]],
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = mul nuw nsw i64 %i.05, 7
+  %arrayidx = getelementptr inbounds double, double* %dst, i64 %mul
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  store double %add, double* %arrayidx, align 8
+  %inc = add nuw nsw i64 %i.05, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+
+define void @cond_stride7_f64(double* noalias nocapture %dst, i64* noalias nocapture readonly %cond, i64 %n) {
+; CHECK-LABEL: @cond_stride7_f64(
+; CHECK: vector.body
+; CHECK: %[[MASK:.*]] = icmp ne <vscale x 2 x i64>
+; CHECK: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, <vscale x 2 x i64> %{{.*}}
+; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]]
+; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]],
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %[[VALS]], <vscale x 2 x double*> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %cond, i64 %i.07
+  %0 = load i64, i64* %arrayidx, align 8
+  %tobool.not = icmp eq i64 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %mul = mul nsw i64 %i.07, 7
+  %arrayidx1 = getelementptr inbounds double, double* %dst, i64 %mul
+  %1 = load double, double* %arrayidx1, align 8
+  %add = fadd double %1, 1.000000e+00
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !7, !3, !4, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
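
For the strided tests above there is no unit-stride vector load that covers dst[7*i], so the vectorizer widens the access into a gather (and the store into a scatter) over a vector of pointers built from the vector induction variable. A self-contained sketch of the i32 case, outside any diff (function and value names are illustrative):

; Gathers dst[7*iv[lane]] for each active lane, mirroring the
; CHECK lines of @stride7_i32 above.
define <vscale x 4 x i32> @gather_stride7(i32* %dst, <vscale x 4 x i64> %iv, <vscale x 4 x i1> %mask) {
  %seven.ins = insertelement <vscale x 4 x i64> poison, i64 7, i32 0
  %seven = shufflevector <vscale x 4 x i64> %seven.ins, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
  ; One pointer per lane: &dst[7 * iv[lane]]
  %offsets = mul nuw nsw <vscale x 4 x i64> %iv, %seven
  %ptrs = getelementptr inbounds i32, i32* %dst, <vscale x 4 x i64> %offsets
  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
  ret <vscale x 4 x i32> %vals
}

declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)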
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll
index 34264562e71f..ee80553853eb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-scalable-load-in-loop.ll
@@ -1,5 +1,4 @@
-; RUN: opt -S -loop-vectorize -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+; RUN: opt -S -loop-vectorize -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ; This test is checking that a scalable load inside a loop does not trigger a
 ; TypeSize error in the loop vectorization legality analysis. It is possible for
@@ -8,9 +7,6 @@
 ; load, it should not be considered for analysis, and we should not see a
 ; TypeSize error.
 
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning:
-
 ; #include <arm_sve.h>
 ;
 ; void scalable_load_in_loop(long n, int *a, int *b, svuint32_t *x,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index d803f6b75ed9..ab45cd847a26 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -10,13 +10,7 @@
 
 ; The test checks if the mask is being correctly created, reverted and used
 
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
-
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index aef5efe030f5..c43927045550 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -4,12 +4,7 @@
 ; for (int i = N-1; i >= 0; --i)
 ;   a[i] = b[i] + 1.0;
 
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
 
 define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{
 ; CHECK-LABEL: @vector_reverse_f64
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 22de7c1e1ca8..5d539ca36d49 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -10,13 +10,7 @@
 
 ; The test checks if the mask is being correctly created, reverted and used
 
-; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
-; WARN-NOT: warning
-
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
index ae3aad1e75a8..311dfdb068ca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -5,12 +5,7 @@
 ; for (int i = N-1; i >= 0; --i)
 ;   a[i] = b[i] + 1.0;
 
-; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
-
-; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
-
-; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.$
-; WARN-NOT: warning
+; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
 
 define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
 ; CHECK-LABEL: vector_reverse_f64
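
A closing note on the reverse-loop tests: with a fixed vectorization factor the reversed memory order can be expressed as a shufflevector with a constant descending mask, but for scalable vectors no such constant mask can be written, so at the time of this change the scalable form is provided by the llvm.experimental.vector.reverse intrinsic. A minimal sketch of both forms (function names are illustrative):

; Fixed-width reverse: a constant descending shuffle mask.
define <4 x float> @reverse_v4f32(<4 x float> %v) {
  %rev = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %rev
}

; Scalable reverse: the lane count is unknown at compile time,
; so an intrinsic is used instead of a shuffle mask.
define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %v) {
  %rev = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %v)
  ret <vscale x 4 x float> %rev
}

declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)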