author     Jessica Paquette <jpaquette@apple.com>    2020-11-30 17:21:21 -0800
committer  Jessica Paquette <jpaquette@apple.com>    2020-12-01 15:45:14 -0800
commit     6c3fa97d8a628541c82d8981aabefcb2dcb29f17
tree       d6a8647ffe8dc41a49d5d93e847400f24ce6cf67
parent     [NFC][AMDGPU] AMDGPU code object V4 ABI documentation
[AArch64][GlobalISel] Select Bcc when it's better than TB(N)Z
When we fail to select an optimized compare against 0, select Bcc instead of
falling back to TB(N)Z.
Also simplify selectCompareBranch a little while we're here, since the logic
was hard to follow.
At -O0, this is a 0.1% geomean code size improvement for CTMark.
A simple example of where this can kick in is here:
https://godbolt.org/z/4rra6P
In the example above, GlobalISel currently produces a subs, cset, and tbnz.
SelectionDAG, on the other hand, just emits a compare and b.le.
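For reference, a compare of roughly this shape reproduces the pattern (a hedged C++ reconstruction; the exact godbolt source is not copied here, and the function name is made up):

    // Hypothetical example: a signed compare against 0 that cannot be lowered
    // to a lone CB(N)Z or TB(N)Z. Before this patch, GlobalISel materialized
    // the boolean with subs + cset and branched on bit 0 with tbnz, while
    // SelectionDAG emitted cmp + b.le directly.
    int pick(int x) {
      if (x > 0)
        return 10;
      return 20;
    }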
Differential Revision: https://reviews.llvm.org/D92358
3 files changed, 60 insertions, 52 deletions
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 6691bf068042..3dba92eea3d3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -257,6 +257,11 @@ private:
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;
 
+  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
+  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
+                        MachineBasicBlock *DestMBB,
+                        MachineIRBuilder &MIB) const;
+
   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
   // We use these manually instead of using the importer since it doesn't
   // support SDNodeXForm.
@@ -1394,9 +1399,7 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
   // Only support EQ and NE. If we have LT, then it *is* possible to fold, but
   // we don't want to do this. When we have an AND and LT, we need a TST/ANDS,
   // so folding would be redundant.
-  if (Pred != CmpInst::Predicate::ICMP_EQ &&
-      Pred != CmpInst::Predicate::ICMP_NE)
-    return false;
+  assert(ICmpInst::isEquality(Pred) && "Expected only eq/ne?");
 
   // Check if the AND has a constant on its RHS which we can use as a mask.
   // If it's a power of 2, then it's the same as checking a specific bit.
@@ -1415,6 +1418,27 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
   return true;
 }
 
+MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
+                                                  bool IsNegative,
+                                                  MachineBasicBlock *DestMBB,
+                                                  MachineIRBuilder &MIB) const {
+  assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
+             AArch64::GPRRegBankID &&
+         "Expected GPRs only?");
+  auto Ty = MRI.getType(CompareReg);
+  unsigned Width = Ty.getSizeInBits();
+  assert(!Ty.isVector() && "Expected scalar only?");
+  assert(Width <= 64 && "Expected width to be at most 64?");
+  static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
+                                          {AArch64::CBNZW, AArch64::CBNZX}};
+  unsigned Opc = OpcTable[IsNegative][Width == 64];
+  auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
+  constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
+  return &*BranchMI;
+}
+
 bool AArch64InstructionSelector::selectCompareBranch(
     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
@@ -1477,51 +1501,39 @@ bool AArch64InstructionSelector::selectCompareBranch(
     }
   }
 
-  if (!VRegAndVal) {
-    std::swap(RHS, LHS);
-    VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
-    LHSMI = getDefIgnoringCopies(LHS, MRI);
-  }
+  // Attempt to handle commutative condition codes. Right now, that's only
+  // eq/ne.
+  if (ICmpInst::isEquality(Pred)) {
+    if (!VRegAndVal) {
+      std::swap(RHS, LHS);
+      VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+      LHSMI = getDefIgnoringCopies(LHS, MRI);
+    }
 
-  if (!VRegAndVal || VRegAndVal->Value != 0) {
-    // If we can't select a CBZ then emit a cmp + Bcc.
-    auto Pred =
-        static_cast<CmpInst::Predicate>(CCMI->getOperand(1).getPredicate());
-    emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3),
-                       CCMI->getOperand(1), MIB);
-    const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred);
-    MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
-    I.eraseFromParent();
-    return true;
-  }
+    if (VRegAndVal && VRegAndVal->Value == 0) {
+      // If there's a G_AND feeding into this branch, try to fold it away by
+      // emitting a TB(N)Z instead.
+      if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB,
+                                     MIB)) {
+        I.eraseFromParent();
+        return true;
+      }
 
-  // Try to emit a TB(N)Z for an eq or ne condition.
-  if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB,
-                                 MIB)) {
-    I.eraseFromParent();
-    return true;
+      // Otherwise, try to emit a CB(N)Z instead.
+      auto LHSTy = MRI.getType(LHS);
+      if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
+        emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
+        I.eraseFromParent();
+        return true;
+      }
+    }
   }
 
-  const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
-  if (RB.getID() != AArch64::GPRRegBankID)
-    return false;
-  if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
-    return false;
-
-  const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
-  unsigned CBOpc = 0;
-  if (CmpWidth <= 32)
-    CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
-  else if (CmpWidth == 64)
-    CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
-  else
-    return false;
-
-  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
-      .addUse(LHS)
-      .addMBB(DestMBB)
-      .constrainAllUses(TII, TRI, RBI);
-
+  // Couldn't optimize. Emit a compare + bcc.
+  emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3),
+                     CCMI->getOperand(1), MIB);
+  const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred);
+  MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
   I.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir
index bb6ba25d06f7..154f00b96de3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir
@@ -143,8 +143,7 @@ body: |
   ; CHECK: liveins: $w0
   ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
   ; CHECK: [[ANDSWri:%[0-9]+]]:gpr32 = ANDSWri [[COPY]], 0, implicit-def $nzcv
-  ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 10, implicit $nzcv
-  ; CHECK: TBNZW [[CSINCWr]], 0, %bb.1
+  ; CHECK: Bcc 11, %bb.1, implicit $nzcv
   ; CHECK: B %bb.0
   ; CHECK: bb.1:
   ; CHECK: RET_ReallyLR
@@ -176,8 +175,7 @@ body: |
   ; CHECK: liveins: $w0
   ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
   ; CHECK: [[ANDSWri:%[0-9]+]]:gpr32 = ANDSWri [[COPY]], 0, implicit-def $nzcv
-  ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
-  ; CHECK: TBNZW [[CSINCWr]], 0, %bb.1
+  ; CHECK: Bcc 12, %bb.1, implicit $nzcv
   ; CHECK: B %bb.0
   ; CHECK: bb.1:
   ; CHECK: RET_ReallyLR
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/tbnz-slt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/tbnz-slt.mir
index 2be18832a0e5..d8f962cdfb76 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/tbnz-slt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/tbnz-slt.mir
@@ -100,8 +100,7 @@ body: |
   ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
   ; CHECK: %copy:gpr64 = COPY $x0
   ; CHECK: [[ANDSXri:%[0-9]+]]:gpr64 = ANDSXri %copy, 8000, implicit-def $nzcv
-  ; CHECK: %cmp:gpr32 = CSINCWr $wzr, $wzr, 10, implicit $nzcv
-  ; CHECK: TBNZW %cmp, 0, %bb.1
+  ; CHECK: Bcc 11, %bb.1, implicit $nzcv
   ; CHECK: B %bb.0
   ; CHECK: bb.1:
   ; CHECK: RET_ReallyLR
@@ -133,8 +132,7 @@ body: |
   ; CHECK: %copy:gpr64 = COPY $x0
   ; CHECK: %zero:gpr64 = COPY $xzr
   ; CHECK: [[SUBSXrr:%[0-9]+]]:gpr64 = SUBSXrr %zero, %copy, implicit-def $nzcv
-  ; CHECK: %cmp:gpr32 = CSINCWr $wzr, $wzr, 10, implicit $nzcv
-  ; CHECK: TBNZW %cmp, 0, %bb.1
+  ; CHECK: Bcc 11, %bb.1, implicit $nzcv
   ; CHECK: B %bb.0
   ; CHECK: bb.1:
   ; CHECK: RET_ReallyLR
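As an aside on the emitCBZ helper above: the OpcTable lookup replaces the old if/else chain over CmpWidth and Pred with a single indexed read. A minimal standalone sketch of that idiom, using stand-in opcode constants rather than the real AArch64 enum values:

    #include <cstdio>

    // Stand-in opcodes for illustration; the real values are AArch64::CBZW,
    // AArch64::CBZX, AArch64::CBNZW, and AArch64::CBNZX.
    enum Opcode { CBZW, CBZX, CBNZW, CBNZX };

    // Row selects CBZ vs. CBNZ, column selects the 32- vs. 64-bit form,
    // mirroring OpcTable[IsNegative][Width == 64] in emitCBZ.
    static Opcode selectCBZOpcode(bool IsNegative, unsigned Width) {
      static const Opcode Table[2][2] = {{CBZW, CBZX}, {CBNZW, CBNZX}};
      return Table[IsNegative][Width == 64];
    }

    int main() {
      // Prints 3, i.e. CBNZX: the 64-bit compare-nonzero-and-branch form.
      std::printf("%d\n", selectCBZOpcode(/*IsNegative=*/true, /*Width=*/64));
      return 0;
    }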