about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h |  5
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp          | 64
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                    |  1
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/avx512.ll         |  6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll  | 22
5 files changed, 62 insertions, 36 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 809cd808f233..d76b557dda13 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -181,7 +181,10 @@ struct VectorizationFactor {
// Vector width with best cost
ElementCount Width;
// Cost of the loop with that width
- unsigned Cost;
+ InstructionCost Cost;
+
+ VectorizationFactor(ElementCount Width, InstructionCost Cost)
+ : Width(Width), Cost(Cost) {}
// Width 1 means no vectorization, cost 0 means uncomputed cost.
static VectorizationFactor Disabled() {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a371831da384..d4eced438900 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1602,6 +1602,11 @@ public:
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize) const;
+ /// Returns true if the per-lane cost of VectorizationFactor A is lower than
+ /// that of B.
+ bool isMoreProfitable(const VectorizationFactor &A,
+ const VectorizationFactor &B) const;
+
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
WideningDecisions.clear();
@@ -5876,6 +5881,18 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
return MaxVF;
}
+bool LoopVectorizationCostModel::isMoreProfitable(
+ const VectorizationFactor &A, const VectorizationFactor &B) const {
+ InstructionCost::CostType CostA = *A.Cost.getValue();
+ InstructionCost::CostType CostB = *B.Cost.getValue();
+
+ // To avoid the need for FP division:
+ // (CostA / A.Width) < (CostB / B.Width)
+ // <=> (CostA * B.Width) < (CostB * A.Width)
+ return (CostA * B.Width.getKnownMinValue()) <
+ (CostB * A.Width.getKnownMinValue());
+}
+
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
// FIXME: This can be fixed for scalable vectors later, because at this stage
@@ -5887,16 +5904,15 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
- auto Width = ElementCount::getFixed(1);
- const float ScalarCost = *ExpectedCost.getValue();
- float Cost = ScalarCost;
+ const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
+ VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && MaxVF.isVector()) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
- Cost = std::numeric_limits<float>::max();
+ ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
}
for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
@@ -5905,10 +5921,14 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
// we need to divide the cost of the vector loops by the width of
// the vector elements.
VectorizationCostTy C = expectedCost(i);
+
assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
- float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
- LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
+ VectorizationFactor Candidate(i, C.first);
+ LLVM_DEBUG(
+ dbgs() << "LV: Vector loop of width " << i << " costs: "
+ << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue())
+ << ".\n");
+
if (!C.second && !ForceVectorization) {
LLVM_DEBUG(
dbgs() << "LV: Not considering vector loop of width " << i
@@ -5917,32 +5937,27 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
}
// If profitable add it to ProfitableVF list.
- if (VectorCost < ScalarCost) {
- ProfitableVFs.push_back(VectorizationFactor(
- {i, (unsigned)VectorCost}));
- }
+ if (isMoreProfitable(Candidate, ScalarCost))
+ ProfitableVFs.push_back(Candidate);
- if (VectorCost < Cost) {
- Cost = VectorCost;
- Width = i;
- }
+ if (isMoreProfitable(Candidate, ChosenFactor))
+ ChosenFactor = Candidate;
}
if (!EnableCondStoresVectorization && NumPredStores) {
reportVectorizationFailure("There are conditional stores.",
"store that is conditionally executed prevents vectorization",
"ConditionalStore", ORE, TheLoop);
- Width = ElementCount::getFixed(1);
- Cost = ScalarCost;
+ ChosenFactor = ScalarCost;
}
- LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
+ LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
+ *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
+ dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
- LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- VectorizationFactor Factor = {Width,
- (unsigned)(Width.getKnownMinValue() * Cost)};
- return Factor;
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
+ return ChosenFactor;
}
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
@@ -6055,7 +6070,8 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
for (auto &NextVF : ProfitableVFs)
if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
- (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
+ (Result.Width.getFixedValue() == 1 ||
+ isMoreProfitable(NextVF, Result)) &&
LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
Result = NextVF;
@@ -9773,7 +9789,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+ IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
}
// Identify the diagnostic messages that should be produced.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 495a98a97980..bcae779b5763 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -40,6 +40,7 @@
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/InstructionCost.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
index 0917e0072244..8542bc5c8219 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -10,6 +10,8 @@ target triple = "x86_64-apple-macosx10.9.0"
; CHECK-LABEL: f:
; CHECK: vmovdqu64 %zmm{{.}},
; CHECK-NOT: %ymm
+; CHECK: epilog
+; CHECK: %ymm
; Verify that we don't generate 512-bit wide vectors when subtarget feature says not to
@@ -82,10 +84,14 @@ for.end: ; preds = %for.end.loopexit, %
; CHECK-LABEL: h:
; CHECK: vmovdqu64 %zmm{{.}},
; CHECK-NOT: %ymm
+; CHECK: epilog
+; CHECK: %ymm
; CHECK-PREFER-AVX256-LABEL: h:
; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}},
; CHECK-PREFER-AVX256-NOT: %ymm
+; CHECK-PREFER-AVX256: epilog
+; CHECK-PREFER-AVX256: %ymm
define void @h(i32* %a, i32 %n) "prefer-vector-width"="512" {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index 5e213f18ebe5..1043091fe7a5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -168,7 +168,7 @@ define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 127
@@ -225,7 +225,7 @@ define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[CAST_CRD22:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END23:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD22]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 120
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 112
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
@@ -233,24 +233,24 @@ define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
; CHECK-NEXT: [[TMP24:%.*]] = add nuw nsw i64 [[TMP23]], 1
-; CHECK-NEXT: [[N_VEC19:%.*]] = and i64 [[TMP24]], 8589934584
+; CHECK-NEXT: [[N_VEC19:%.*]] = and i64 [[TMP24]], 8589934576
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END25:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]]
; CHECK-NEXT: [[IND_END28:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT35]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT35]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP32:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX20]]
; CHECK-NEXT: [[NEXT_GEP33:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX20]]
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP32]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 2
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> [[WIDE_LOAD34]], <8 x i8> [[WIDE_LOAD34]], <8 x i8> [[BROADCAST_SPLAT36]])
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP33]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP26]], <8 x i8>* [[TMP27]], align 2
-; CHECK-NEXT: [[INDEX_NEXT21]] = add i64 [[INDEX20]], 8
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP32]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <16 x i8>, <16 x i8>* [[TMP25]], align 2
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD34]], <16 x i8> [[WIDE_LOAD34]], <16 x i8> [[BROADCAST_SPLAT36]])
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP33]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP26]], <16 x i8>* [[TMP27]], align 2
+; CHECK-NEXT: [[INDEX_NEXT21]] = add i64 [[INDEX20]], 16
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC19]]
; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: vec.epilog.middle.block: