Diffstat (limited to 'llvm/lib/Target/AArch64')
25 files changed, 918 insertions, 547 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index bdf2e517deda..133a6b16e979 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -147,12 +147,12 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", "Has zero-cycle zeroing instructions for generic registers">; -def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true", - "Has zero-cycle zeroing instructions for FP registers">; +def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", + "Has no zero-cycle zeroing instructions for FP registers">; def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions", - [FeatureZCZeroingGP, FeatureZCZeroingFP]>; + [FeatureZCZeroingGP]>; /// ... but the floating-point version doesn't quite work in rare cases on older /// CPUs. @@ -915,8 +915,7 @@ def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureLSLFast, FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - FeatureZCZeroingFP]>; + FeaturePredictableSelectIsExpensive]>; def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M4 processors", diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 9b757f7aba5e..3373e6c91b7f 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1091,17 +1091,16 @@ void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) { void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { - // Convert H/S/D register to corresponding Q register + // Convert H/S register to corresponding D register if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) - DestReg = AArch64::Q0 + (DestReg - AArch64::H0); + DestReg = AArch64::D0 + (DestReg - AArch64::H0); else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) - DestReg = AArch64::Q0 + (DestReg - AArch64::S0); - else { + DestReg = AArch64::D0 + (DestReg - AArch64::S0); + else assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); - DestReg = AArch64::Q0 + (DestReg - AArch64::D0); - } + MCInst MOVI; - MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.setOpcode(AArch64::MOVID); MOVI.addOperand(MCOperand::createReg(DestReg)); MOVI.addOperand(MCOperand::createImm(0)); EmitToStreamer(*OutStreamer, MOVI); diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index d5ea2d3eee98..07608fc56990 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -127,7 +127,14 @@ def adjust_icmp_imm : GICombineRule < (apply [{ applyAdjustICmpImmAndPred(*${root}, ${matchinfo}, B, Observer); }]) >; -def icmp_lowering : GICombineGroup<[adjust_icmp_imm]>; +def swap_icmp_operands : GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_ICMP):$root, + [{ return trySwapICmpOperands(*${root}, MRI); }]), + (apply [{ applySwapICmpOperands(*${root}, Observer); }]) +>; + +def icmp_lowering : GICombineGroup<[adjust_icmp_imm, swap_icmp_operands]>; def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple<unsigned, LLT, Register>">; def 
extractvecelt_pairwise_add : GICombineRule< @@ -154,6 +161,14 @@ def build_vector_to_dup : GICombineRule< def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>; +def bitfield_extract_from_sext_inreg : GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_SEXT_INREG):$root, + [{ return matchBitfieldExtractFromSExtInReg(*${root}, MRI, ${info}); }]), + (apply [{ return Helper.applyBuildFn(*${root}, ${info}); }])>; + +def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg]>; + // Post-legalization combines which should happen at all optimization levels. // (E.g. ones that facilitate matching for the selector) For example, matching // pseudos. @@ -172,6 +187,7 @@ def AArch64PostLegalizerCombinerHelper hoist_logic_op_with_same_opcode_hands, redundant_and, xor_of_and_with_same_reg, extractvecelt_pairwise_add, redundant_or, - mul_const, redundant_sext_inreg]> { + mul_const, redundant_sext_inreg, + form_bitfield_extract]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index ecc68ccda03d..95b5699552b0 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -195,34 +195,32 @@ private: const Value *Cond); bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT); bool optimizeSelect(const SelectInst *SI); - std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx); + unsigned getRegForGEPIndex(const Value *Idx); // Emit helper routines. unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, - bool SetFlags = false, bool WantResult = true); + unsigned RHSReg, bool SetFlags = false, + bool WantResult = true); unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, uint64_t Imm, bool SetFlags = false, + uint64_t Imm, bool SetFlags = false, bool WantResult = true); unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, - AArch64_AM::ShiftExtendType ShiftType, + unsigned RHSReg, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool SetFlags = false, bool WantResult = true); unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, - AArch64_AM::ShiftExtendType ExtType, - uint64_t ShiftImm, bool SetFlags = false, + unsigned RHSReg, AArch64_AM::ShiftExtendType ExtType, + uint64_t ShiftImm, bool SetFlags = false, bool WantResult = true); // Emit functions. 
bool emitCompareAndBranch(const BranchInst *BI); bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt); - bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); + bool emitICmp_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm); bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS); unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true, MachineMemOperand *MMO = nullptr); @@ -235,42 +233,34 @@ private: unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); - unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm); + unsigned emitAdd_ri_(MVT VT, unsigned Op0, int64_t Imm); unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); - unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill, - unsigned RHSReg, bool RHSIsKill, bool WantResult = true); - unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill, - unsigned RHSReg, bool RHSIsKill, + unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, unsigned RHSReg, + bool WantResult = true); + unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, unsigned RHSReg, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool WantResult = true); unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS, const Value *RHS); unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, uint64_t Imm); + uint64_t Imm); unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, - uint64_t ShiftImm); - unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); - unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill); - unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill); - unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill); - unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, - unsigned Op1Reg, bool Op1IsKill); - unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, - uint64_t Imm, bool IsZExt = true); - unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, - unsigned Op1Reg, bool Op1IsKill); - unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, - uint64_t Imm, bool IsZExt = true); - unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, - unsigned Op1Reg, bool Op1IsKill); - unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, - uint64_t Imm, bool IsZExt = false); + unsigned RHSReg, uint64_t ShiftImm); + unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm); + unsigned emitMul_rr(MVT RetVT, unsigned Op0, unsigned Op1); + unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1); + unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1); + unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, unsigned Op1Reg); + unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, uint64_t Imm, + bool IsZExt = true); + unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, unsigned Op1Reg); + unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, uint64_t Imm, + bool IsZExt = true); + unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, unsigned Op1Reg); + unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, uint64_t Imm, + 
bool IsZExt = false); unsigned materializeInt(const ConstantInt *CI, MVT VT); unsigned materializeFP(const ConstantFP *CFP, MVT VT); @@ -414,8 +404,8 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } - // For the MachO large code model materialize the FP constant in code. - if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + // For the large code model materialize the FP constant in code. + if (TM.getCodeModel() == CodeModel::Large) { unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; const TargetRegisterClass *RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; @@ -554,7 +544,7 @@ unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) { bool Is64Bit = (VT == MVT::f64); unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR; unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr; - return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true); + return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg); } /// Check if the multiply is by a power-of-2 constant. @@ -764,9 +754,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) unsigned Reg = getRegForValue(LHS); if (!Reg) return false; - bool RegIsKill = hasTrivialKill(LHS); - Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, - AArch64::sub_32); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32); Addr.setOffsetReg(Reg); return true; } @@ -862,9 +850,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) unsigned Reg = getRegForValue(LHS); if (!Reg) return false; - bool RegIsKill = hasTrivialKill(LHS); - Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, - AArch64::sub_32); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32); Addr.setOffsetReg(Reg); return true; } @@ -1064,26 +1050,22 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { if (Addr.getExtendType() == AArch64_AM::SXTW || Addr.getExtendType() == AArch64_AM::UXTW ) ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(), - /*TODO:IsKill=*/false, Addr.getOffsetReg(), - /*TODO:IsKill=*/false, Addr.getExtendType(), + Addr.getOffsetReg(), Addr.getExtendType(), Addr.getShift()); else ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(), - /*TODO:IsKill=*/false, Addr.getOffsetReg(), - /*TODO:IsKill=*/false, AArch64_AM::LSL, + Addr.getOffsetReg(), AArch64_AM::LSL, Addr.getShift()); } else { if (Addr.getExtendType() == AArch64_AM::UXTW) ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), - /*Op0IsKill=*/false, Addr.getShift(), - /*IsZExt=*/true); + Addr.getShift(), /*IsZExt=*/true); else if (Addr.getExtendType() == AArch64_AM::SXTW) ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), - /*Op0IsKill=*/false, Addr.getShift(), - /*IsZExt=*/false); + Addr.getShift(), /*IsZExt=*/false); else ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(), - /*Op0IsKill=*/false, Addr.getShift()); + Addr.getShift()); } if (!ResultReg) return false; @@ -1100,7 +1082,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { unsigned ResultReg; if (Addr.getReg()) // Try to fold the immediate into the add instruction. 
- ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset); + ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), Offset); else ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset); @@ -1199,7 +1181,6 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; - bool LHSIsKill = hasTrivialKill(LHS); if (NeedExtend) LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt); @@ -1208,15 +1189,14 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (const auto *C = dyn_cast<ConstantInt>(RHS)) { uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue(); if (C->isNegative()) - ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm, - SetFlags, WantResult); + ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, -Imm, SetFlags, + WantResult); else - ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags, + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, Imm, SetFlags, WantResult); } else if (const auto *C = dyn_cast<Constant>(RHS)) if (C->isNullValue()) - ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags, - WantResult); + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, 0, SetFlags, WantResult); if (ResultReg) return ResultReg; @@ -1230,17 +1210,14 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, unsigned RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ExtendType, C->getZExtValue(), - SetFlags, WantResult); + return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, + C->getZExtValue(), SetFlags, WantResult); } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(RHS); - return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, - ExtendType, 0, SetFlags, WantResult); + return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, 0, + SetFlags, WantResult); } // Check if the mul can be folded into the instruction. 
@@ -1258,10 +1235,8 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, unsigned RHSReg = getRegForValue(MulLHS); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(MulLHS); - ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, - WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, AArch64_AM::LSL, + ShiftVal, SetFlags, WantResult); if (ResultReg) return ResultReg; } @@ -1283,10 +1258,8 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, unsigned RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftType, ShiftVal, SetFlags, - WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, ShiftType, + ShiftVal, SetFlags, WantResult); if (ResultReg) return ResultReg; } @@ -1297,18 +1270,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(RHS); if (NeedExtend) RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt); - return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, - SetFlags, WantResult); + return emitAddSub_rr(UseAdd, RetVT, LHSReg, RHSReg, SetFlags, WantResult); } unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, - bool RHSIsKill, bool SetFlags, + unsigned RHSReg, bool SetFlags, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); @@ -1339,14 +1309,14 @@ unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(LHSReg, getKillRegState(LHSIsKill)) - .addReg(RHSReg, getKillRegState(RHSIsKill)); + .addReg(LHSReg) + .addReg(RHSReg); return ResultReg; } unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, uint64_t Imm, - bool SetFlags, bool WantResult) { + uint64_t Imm, bool SetFlags, + bool WantResult) { assert(LHSReg && "Invalid register number."); if (RetVT != MVT::i32 && RetVT != MVT::i64) @@ -1383,15 +1353,14 @@ unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, const MCInstrDesc &II = TII.get(Opc); LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(LHSReg) .addImm(Imm) .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm)); return ResultReg; } unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, - bool RHSIsKill, + unsigned RHSReg, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool SetFlags, bool WantResult) { @@ -1426,15 +1395,14 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(LHSReg, getKillRegState(LHSIsKill)) - .addReg(RHSReg, getKillRegState(RHSIsKill)) + .addReg(LHSReg) + .addReg(RHSReg) 
.addImm(getShifterImm(ShiftType, ShiftImm)); return ResultReg; } unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, - bool RHSIsKill, + unsigned RHSReg, AArch64_AM::ShiftExtendType ExtType, uint64_t ShiftImm, bool SetFlags, bool WantResult) { @@ -1471,8 +1439,8 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(LHSReg, getKillRegState(LHSIsKill)) - .addReg(RHSReg, getKillRegState(RHSIsKill)) + .addReg(LHSReg) + .addReg(RHSReg) .addImm(getArithExtendImm(ExtType, ShiftImm)); return ResultReg; } @@ -1505,9 +1473,8 @@ bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, IsZExt) != 0; } -bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, - uint64_t Imm) { - return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm, +bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm) { + return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, Imm, /*SetFlags=*/true, /*WantResult=*/false) != 0; } @@ -1525,24 +1492,22 @@ bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) { unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return false; - bool LHSIsKill = hasTrivialKill(LHS); if (UseImm) { unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) - .addReg(LHSReg, getKillRegState(LHSIsKill)); + .addReg(LHSReg); return true; } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return false; - bool RHSIsKill = hasTrivialKill(RHS); unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) - .addReg(LHSReg, getKillRegState(LHSIsKill)) - .addReg(RHSReg, getKillRegState(RHSIsKill)); + .addReg(LHSReg) + .addReg(RHSReg); return true; } @@ -1557,13 +1522,12 @@ unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, /// First try to emit an add with an immediate operand using emitAddSub_ri. If /// that fails, then try to materialize the immediate into a register and use /// emitAddSub_rr instead. 
-unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, - int64_t Imm) { +unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, int64_t Imm) { unsigned ResultReg; if (Imm < 0) - ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm); + ResultReg = emitAddSub_ri(false, VT, Op0, -Imm); else - ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm); + ResultReg = emitAddSub_ri(true, VT, Op0, Imm); if (ResultReg) return ResultReg; @@ -1572,7 +1536,7 @@ unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, if (!CReg) return 0; - ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true); + ResultReg = emitAddSub_rr(true, VT, Op0, CReg); return ResultReg; } @@ -1583,20 +1547,17 @@ unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS, } unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, - bool RHSIsKill, bool WantResult) { - return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, /*SetFlags=*/true, WantResult); + unsigned RHSReg, bool WantResult) { + return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, RHSReg, + /*SetFlags=*/true, WantResult); } unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg, - bool LHSIsKill, unsigned RHSReg, - bool RHSIsKill, + unsigned RHSReg, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool WantResult) { - return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true, - WantResult); + return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, RHSReg, ShiftType, + ShiftImm, /*SetFlags=*/true, WantResult); } unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, @@ -1619,12 +1580,11 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; - bool LHSIsKill = hasTrivialKill(LHS); unsigned ResultReg = 0; if (const auto *C = dyn_cast<ConstantInt>(RHS)) { uint64_t Imm = C->getZExtValue(); - ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm); + ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, Imm); } if (ResultReg) return ResultReg; @@ -1645,9 +1605,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned RHSReg = getRegForValue(MulLHS); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(MulLHS); - ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal); if (ResultReg) return ResultReg; } @@ -1661,9 +1619,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal); if (ResultReg) return ResultReg; } @@ -1672,20 +1628,18 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; - bool RHSIsKill = hasTrivialKill(RHS); MVT VT = std::max(MVT::i32, RetVT.SimpleTy); - ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, RHSReg); if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { uint64_t Mask = (RetVT == MVT::i8) ? 
0xff : 0xffff; - ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); } return ResultReg; } unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, - unsigned LHSReg, bool LHSIsKill, - uint64_t Imm) { + unsigned LHSReg, uint64_t Imm) { static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { @@ -1720,18 +1674,17 @@ unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, return 0; unsigned ResultReg = - fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill, + fastEmitInst_ri(Opc, RC, LHSReg, AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) { uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; - ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); } return ResultReg; } unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, - unsigned LHSReg, bool LHSIsKill, - unsigned RHSReg, bool RHSIsKill, + unsigned LHSReg, unsigned RHSReg, uint64_t ShiftImm) { static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), "ISD nodes are not consecutive!"); @@ -1763,18 +1716,18 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, break; } unsigned ResultReg = - fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + fastEmitInst_rri(Opc, RC, LHSReg, RHSReg, AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm)); if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; - ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); } return ResultReg; } -unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, +unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, uint64_t Imm) { - return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm); + return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, Imm); } unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, @@ -1895,7 +1848,7 @@ unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, // Loading an i1 requires special handling. if (VT == MVT::i1) { - unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); + unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, 1); assert(ANDReg && "Unexpected AND instruction emission failure."); ResultReg = ANDReg; } @@ -2049,7 +2002,6 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { removeDeadCode(I, std::next(I)); } else ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, - /*IsKill=*/true, AArch64::sub_32); } updateValueMap(I, ResultReg); @@ -2157,7 +2109,7 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, // Storing an i1 requires special handling. 
if (VTIsi1 && SrcReg != AArch64::WZR) { - unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); + unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, 1); assert(ANDReg && "Unexpected AND instruction emission failure."); SrcReg = ANDReg; } @@ -2390,11 +2342,9 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { unsigned SrcReg = getRegForValue(LHS); if (!SrcReg) return false; - bool SrcIsKill = hasTrivialKill(LHS); if (BW == 64 && !Is64Bit) - SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, - AArch64::sub_32); + SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, AArch64::sub_32); if ((BW < 32) && !IsBitTest) SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*isZExt=*/true); @@ -2403,7 +2353,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) - .addReg(SrcReg, getKillRegState(SrcIsKill)); + .addReg(SrcReg); if (IsBitTest) MIB.addImm(TestBit); MIB.addMBB(TBB); @@ -2521,7 +2471,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { unsigned CondReg = getRegForValue(BI->getCondition()); if (CondReg == 0) return false; - bool CondRegIsKill = hasTrivialKill(BI->getCondition()); // i1 conditions come as i32 values, test the lowest bit with tb(n)z. unsigned Opcode = AArch64::TBNZW; @@ -2534,7 +2483,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { unsigned ConstrainedCondReg = constrainOperandRegClass(II, CondReg, II.getNumDefs()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addReg(ConstrainedCondReg) .addImm(0) .addMBB(TBB); @@ -2684,19 +2633,16 @@ bool AArch64FastISel::optimizeSelect(const SelectInst *SI) { unsigned Src1Reg = getRegForValue(Src1Val); if (!Src1Reg) return false; - bool Src1IsKill = hasTrivialKill(Src1Val); unsigned Src2Reg = getRegForValue(Src2Val); if (!Src2Reg) return false; - bool Src2IsKill = hasTrivialKill(Src2Val); - if (NeedExtraOp) { - Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1); - Src1IsKill = true; - } + if (NeedExtraOp) + Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, 1); + unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg, - Src1IsKill, Src2Reg, Src2IsKill); + Src2Reg); updateValueMap(SI, ResultReg); return true; } @@ -2768,9 +2714,6 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { unsigned SrcReg = getRegForValue(FoldSelect); if (!SrcReg) return false; - unsigned UseReg = lookUpRegForValue(SI); - if (UseReg) - MRI.clearKillFlags(UseReg); updateValueMap(I, SrcReg); return true; @@ -2799,7 +2742,6 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { unsigned CondReg = getRegForValue(Cond); if (!CondReg) return false; - bool CondIsKill = hasTrivialKill(Cond); const MCInstrDesc &II = TII.get(AArch64::ANDSWri); CondReg = constrainOperandRegClass(II, CondReg, 1); @@ -2807,26 +2749,20 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { // Emit a TST instruction (ANDS wzr, reg, #imm). 
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, AArch64::WZR) - .addReg(CondReg, getKillRegState(CondIsKill)) + .addReg(CondReg) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); } unsigned Src1Reg = getRegForValue(SI->getTrueValue()); - bool Src1IsKill = hasTrivialKill(SI->getTrueValue()); - unsigned Src2Reg = getRegForValue(SI->getFalseValue()); - bool Src2IsKill = hasTrivialKill(SI->getFalseValue()); if (!Src1Reg || !Src2Reg) return false; - if (ExtraCC != AArch64CC::AL) { - Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, - Src2IsKill, ExtraCC); - Src2IsKill = true; - } - unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, - Src2IsKill, CC); + if (ExtraCC != AArch64CC::AL) + Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, ExtraCC); + + unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, CC); updateValueMap(I, ResultReg); return true; } @@ -2911,7 +2847,6 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { unsigned SrcReg = getRegForValue(I->getOperand(0)); if (!SrcReg) return false; - bool SrcIsKill = hasTrivialKill(I->getOperand(0)); EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); @@ -2921,7 +2856,6 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); if (!SrcReg) return false; - SrcIsKill = true; } unsigned Opc; @@ -2937,8 +2871,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; } - unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg, - SrcIsKill); + unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg); updateValueMap(I, ResultReg); return true; } @@ -3491,7 +3424,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); while (Depth--) { DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass, - SrcReg, /*IsKill=*/true, 0); + SrcReg, 0); assert(DestReg && "Unexpected LDR instruction emission failure."); SrcReg = DestReg; } @@ -3637,10 +3570,9 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned SrcReg = getRegForValue(II->getOperand(0)); if (!SrcReg) return false; - bool SrcRegIsKill = hasTrivialKill(II->getOperand(0)); unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg, getKillRegState(SrcRegIsKill)); + .addReg(SrcReg); updateValueMap(II, ResultReg); return true; } @@ -3663,9 +3595,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned Op0Reg = getRegForValue(II->getOperand(0)); if (!Op0Reg) return false; - bool Op0IsKill = hasTrivialKill(II->getOperand(0)); - unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill); + unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg); if (!ResultReg) return false; @@ -3742,33 +3673,26 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return false; - bool LHSIsKill = hasTrivialKill(LHS); unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return false; - bool RHSIsKill = hasTrivialKill(RHS); if (VT == MVT::i32) { - MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill); - unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, 
MulReg, - /*IsKill=*/false, 32); - MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true, - AArch64::sub_32); - ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true, - AArch64::sub_32); - emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false, - AArch64_AM::ASR, 31, /*WantResult=*/false); + MulReg = emitSMULL_rr(MVT::i64, LHSReg, RHSReg); + unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg, 32); + MulReg = fastEmitInst_extractsubreg(VT, MulReg, AArch64::sub_32); + ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, AArch64::sub_32); + emitSubs_rs(VT, ShiftReg, MulReg, AArch64_AM::ASR, 31, + /*WantResult=*/false); } else { assert(VT == MVT::i64 && "Unexpected value type."); // LHSReg and RHSReg cannot be killed by this Mul, since they are // reused in the next instruction. - MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg, - /*IsKill=*/false); - unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill, - RHSReg, RHSIsKill); - emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false, - AArch64_AM::ASR, 63, /*WantResult=*/false); + MulReg = emitMul_rr(VT, LHSReg, RHSReg); + unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, RHSReg); + emitSubs_rs(VT, SMULHReg, MulReg, AArch64_AM::ASR, 63, + /*WantResult=*/false); } break; } @@ -3777,30 +3701,23 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return false; - bool LHSIsKill = hasTrivialKill(LHS); unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return false; - bool RHSIsKill = hasTrivialKill(RHS); if (VT == MVT::i32) { - MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill); - emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg, - /*IsKill=*/false, AArch64_AM::LSR, 32, + MulReg = emitUMULL_rr(MVT::i64, LHSReg, RHSReg); + emitSubs_rs(MVT::i64, AArch64::XZR, MulReg, AArch64_AM::LSR, 32, /*WantResult=*/false); - MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true, - AArch64::sub_32); + MulReg = fastEmitInst_extractsubreg(VT, MulReg, AArch64::sub_32); } else { assert(VT == MVT::i64 && "Unexpected value type."); // LHSReg and RHSReg cannot be killed by this Mul, since they are // reused in the next instruction. - MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg, - /*IsKill=*/false); - unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill, - RHSReg, RHSIsKill); - emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg, - /*IsKill=*/false, /*WantResult=*/false); + MulReg = emitMul_rr(VT, LHSReg, RHSReg); + unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, RHSReg); + emitSubs_rr(VT, AArch64::XZR, UMULHReg, /*WantResult=*/false); } break; } @@ -3816,8 +3733,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { return false; ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass, - AArch64::WZR, /*IsKill=*/true, AArch64::WZR, - /*IsKill=*/true, getInvertedCondCode(CC)); + AArch64::WZR, AArch64::WZR, + getInvertedCondCode(CC)); (void)ResultReg2; assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers."); @@ -3917,7 +3834,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { // "Callee" (i.e. value producer) zero extends pointers at function // boundary. if (Subtarget->isTargetILP32() && RV->getType()->isPointerTy()) - SrcReg = emitAnd_ri(MVT::i64, SrcReg, false, 0xffffffff); + SrcReg = emitAnd_ri(MVT::i64, SrcReg, 0xffffffff); // Make the copy. 
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3959,7 +3876,6 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) { unsigned SrcReg = getRegForValue(Op); if (!SrcReg) return false; - bool SrcIsKill = hasTrivialKill(Op); // If we're truncating from i64 to a smaller non-legal type then generate an // AND. Otherwise, we know the high bits are undefined and a truncate only @@ -3984,16 +3900,16 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) { break; } // Issue an extract_subreg to get the lower 32-bits. - unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, + unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, AArch64::sub_32); // Create the AND instruction which performs the actual truncation. - ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask); + ResultReg = emitAnd_ri(MVT::i32, Reg32, Mask); assert(ResultReg && "Unexpected AND instruction emission failure."); } else { ResultReg = createResultReg(&AArch64::GPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) - .addReg(SrcReg, getKillRegState(SrcIsKill)); + .addReg(SrcReg); } updateValueMap(I, ResultReg); @@ -4009,7 +3925,7 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { DestVT = MVT::i32; if (IsZExt) { - unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); + unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, 1); assert(ResultReg && "Unexpected AND instruction emission failure."); if (DestVT == MVT::i64) { // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the @@ -4029,12 +3945,11 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { return 0; } return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg, - /*TODO:IsKill=*/false, 0, 0); + 0, 0); } } -unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill) { +unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, unsigned Op1) { unsigned Opc, ZReg; switch (RetVT.SimpleTy) { default: return 0; @@ -4049,32 +3964,27 @@ unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, const TargetRegisterClass *RC = (RetVT == MVT::i64) ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill, - /*IsKill=*/ZReg, true); + return fastEmitInst_rrr(Opc, RC, Op0, Op1, ZReg); } -unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill) { +unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1) { if (RetVT != MVT::i64) return 0; return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass, - Op0, Op0IsKill, Op1, Op1IsKill, - AArch64::XZR, /*IsKill=*/true); + Op0, Op1, AArch64::XZR); } -unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill) { +unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, unsigned Op1) { if (RetVT != MVT::i64) return 0; return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass, - Op0, Op0IsKill, Op1, Op1IsKill, - AArch64::XZR, /*IsKill=*/true); + Op0, Op1, AArch64::XZR); } -unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, - unsigned Op1Reg, bool Op1IsKill) { +unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, + unsigned Op1Reg) { unsigned Opc = 0; bool NeedTrunc = false; uint64_t Mask = 0; @@ -4088,20 +3998,17 @@ unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, const TargetRegisterClass *RC = (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - if (NeedTrunc) { - Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); - Op1IsKill = true; - } - unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, - Op1IsKill); if (NeedTrunc) - ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask); + + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); return ResultReg; } unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, - bool Op0IsKill, uint64_t Shift, - bool IsZExt) { + uint64_t Shift, bool IsZExt) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || @@ -4123,7 +4030,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)); + .addReg(Op0); return ResultReg; } else return emitIntExt(SrcVT, Op0, RetVT, IsZExt); @@ -4171,16 +4078,15 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) - .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op0) .addImm(AArch64::sub_32); Op0 = TmpReg; - Op0IsKill = true; } - return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); + return fastEmitInst_rii(Opc, RC, Op0, ImmR, ImmS); } -unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, - unsigned Op1Reg, bool Op1IsKill) { +unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, + unsigned Op1Reg) { unsigned Opc = 0; bool NeedTrunc = false; uint64_t Mask = 0; @@ -4195,20 +4101,17 @@ unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, const TargetRegisterClass *RC = (RetVT == MVT::i64) ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; if (NeedTrunc) { - Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask); - Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); - Op0IsKill = Op1IsKill = true; + Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Mask); + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask); } - unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, - Op1IsKill); + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); if (NeedTrunc) - ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); return ResultReg; } unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, - bool Op0IsKill, uint64_t Shift, - bool IsZExt) { + uint64_t Shift, bool IsZExt) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || @@ -4230,7 +4133,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)); + .addReg(Op0); return ResultReg; } else return emitIntExt(SrcVT, Op0, RetVT, IsZExt); @@ -4274,7 +4177,6 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt); if (!Op0) return 0; - Op0IsKill = true; SrcVT = RetVT; SrcBits = SrcVT.getSizeInBits(); IsZExt = true; @@ -4292,16 +4194,15 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) - .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op0) .addImm(AArch64::sub_32); Op0 = TmpReg; - Op0IsKill = true; } - return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); + return fastEmitInst_rii(Opc, RC, Op0, ImmR, ImmS); } -unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, - unsigned Op1Reg, bool Op1IsKill) { +unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, + unsigned Op1Reg) { unsigned Opc = 0; bool NeedTrunc = false; uint64_t Mask = 0; @@ -4317,19 +4218,16 @@ unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, (RetVT == MVT::i64) ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; if (NeedTrunc) { Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*isZExt=*/false); - Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); - Op0IsKill = Op1IsKill = true; + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask); } - unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, - Op1IsKill); + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); if (NeedTrunc) - ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); return ResultReg; } unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, - bool Op0IsKill, uint64_t Shift, - bool IsZExt) { + uint64_t Shift, bool IsZExt) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || @@ -4351,7 +4249,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)); + .addReg(Op0); return ResultReg; } else return emitIntExt(SrcVT, Op0, RetVT, IsZExt); @@ -4401,12 +4299,11 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) - .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op0) .addImm(AArch64::sub_32); Op0 = TmpReg; - Op0IsKill = true; } - return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); + return fastEmitInst_rii(Opc, RC, Op0, ImmR, ImmS); } unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, @@ -4467,7 +4364,7 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, const TargetRegisterClass *RC = (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm); + return fastEmitInst_rii(Opc, RC, SrcReg, 0, Imm); } static bool isZExtLoad(const MachineInstr *LI) { @@ -4590,7 +4487,6 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) { unsigned SrcReg = getRegForValue(I->getOperand(0)); if (!SrcReg) return false; - bool SrcIsKill = hasTrivialKill(I->getOperand(0)); // Try to optimize already sign-/zero-extended values from function arguments. bool IsZExt = isa<ZExtInst>(I); @@ -4601,17 +4497,10 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), ResultReg) .addImm(0) - .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addReg(SrcReg) .addImm(AArch64::sub_32); SrcReg = ResultReg; } - // Conservatively clear all kill flags from all uses, because we are - // replacing a sign-/zero-extend instruction at IR level with a nop at MI - // level. The result of the instruction at IR level might have been - // trivially dead, which is now not longer true. 
- unsigned UseReg = lookUpRegForValue(I); - if (UseReg) - MRI.clearKillFlags(UseReg); updateValueMap(I, SrcReg); return true; @@ -4651,23 +4540,18 @@ bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; - bool Src0IsKill = hasTrivialKill(I->getOperand(0)); unsigned Src1Reg = getRegForValue(I->getOperand(1)); if (!Src1Reg) return false; - bool Src1IsKill = hasTrivialKill(I->getOperand(1)); const TargetRegisterClass *RC = (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false, - Src1Reg, /*IsKill=*/false); + unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, Src1Reg); assert(QuotReg && "Unexpected DIV instruction emission failure."); // The remainder is computed as numerator - (quotient * denominator) using the // MSUB instruction. - unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true, - Src1Reg, Src1IsKill, Src0Reg, - Src0IsKill); + unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, Src1Reg, Src0Reg); updateValueMap(I, ResultReg); return true; } @@ -4715,10 +4599,9 @@ bool AArch64FastISel::selectMul(const Instruction *I) { unsigned Src0Reg = getRegForValue(Src0); if (!Src0Reg) return false; - bool Src0IsKill = hasTrivialKill(Src0); unsigned ResultReg = - emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt); + emitLSL_ri(VT, SrcVT, Src0Reg, ShiftVal, IsZExt); if (ResultReg) { updateValueMap(I, ResultReg); @@ -4729,14 +4612,12 @@ bool AArch64FastISel::selectMul(const Instruction *I) { unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; - bool Src0IsKill = hasTrivialKill(I->getOperand(0)); unsigned Src1Reg = getRegForValue(I->getOperand(1)); if (!Src1Reg) return false; - bool Src1IsKill = hasTrivialKill(I->getOperand(1)); - unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); + unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src1Reg); if (!ResultReg) return false; @@ -4782,18 +4663,17 @@ bool AArch64FastISel::selectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(Op0); if (!Op0Reg) return false; - bool Op0IsKill = hasTrivialKill(Op0); switch (I->getOpcode()) { default: llvm_unreachable("Unexpected instruction."); case Instruction::Shl: - ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, ShiftVal, IsZExt); break; case Instruction::AShr: - ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, ShiftVal, IsZExt); break; case Instruction::LShr: - ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, ShiftVal, IsZExt); break; } if (!ResultReg) @@ -4806,24 +4686,22 @@ bool AArch64FastISel::selectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (!Op0Reg) return false; - bool Op0IsKill = hasTrivialKill(I->getOperand(0)); unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (!Op1Reg) return false; - bool Op1IsKill = hasTrivialKill(I->getOperand(1)); unsigned ResultReg = 0; switch (I->getOpcode()) { default: llvm_unreachable("Unexpected instruction."); case Instruction::Shl: - ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + ResultReg = emitLSL_rr(RetVT, Op0Reg, Op1Reg); break; case Instruction::AShr: - ResultReg = emitASR_rr(RetVT, 
Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + ResultReg = emitASR_rr(RetVT, Op0Reg, Op1Reg); break; case Instruction::LShr: - ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + ResultReg = emitLSR_rr(RetVT, Op0Reg, Op1Reg); break; } @@ -4865,9 +4743,8 @@ bool AArch64FastISel::selectBitCast(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (!Op0Reg) return false; - bool Op0IsKill = hasTrivialKill(I->getOperand(0)); - unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill); + unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg); if (!ResultReg) return false; @@ -4930,10 +4807,9 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; - bool Src0IsKill = hasTrivialKill(I->getOperand(0)); if (cast<BinaryOperator>(I)->isExact()) { - unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2); + unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Lg2); if (!ResultReg) return false; updateValueMap(I, ResultReg); @@ -4941,12 +4817,12 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { } int64_t Pow2MinusOne = (1ULL << Lg2) - 1; - unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne); + unsigned AddReg = emitAdd_ri_(VT, Src0Reg, Pow2MinusOne); if (!AddReg) return false; // (Src0 < 0) ? Pow2 - 1 : 0; - if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0)) + if (!emitICmp_ri(VT, Src0Reg, 0)) return false; unsigned SelectOpc; @@ -4958,9 +4834,8 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { SelectOpc = AArch64::CSELWr; RC = &AArch64::GPR32RegClass; } - unsigned SelectReg = - fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg, - Src0IsKill, AArch64CC::LT); + unsigned SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg, + AArch64CC::LT); if (!SelectReg) return false; @@ -4969,10 +4844,10 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; unsigned ResultReg; if (C.isNegative()) - ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true, - SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2); + ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, SelectReg, + AArch64_AM::ASR, Lg2); else - ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2); + ResultReg = emitASR_ri(VT, VT, SelectReg, Lg2); if (!ResultReg) return false; @@ -4984,23 +4859,20 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { /// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We /// have to duplicate it for AArch64, because otherwise we would fail during the /// sign-extend emission. -std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) { +unsigned AArch64FastISel::getRegForGEPIndex(const Value *Idx) { unsigned IdxN = getRegForValue(Idx); if (IdxN == 0) // Unhandled operand. Halt "fast" selection and bail. - return std::pair<unsigned, bool>(0, false); - - bool IdxNIsKill = hasTrivialKill(Idx); + return 0; // If the index is smaller or larger than intptr_t, truncate or extend it. 
MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*isZExt=*/false); - IdxNIsKill = true; } else if (IdxVT.bitsGT(PtrVT)) llvm_unreachable("AArch64 FastISel doesn't support types larger than i64"); - return std::pair<unsigned, bool>(IdxN, IdxNIsKill); + return IdxN; } /// This is mostly a copy of the existing FastISel GEP code, but we have to @@ -5014,7 +4886,6 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { unsigned N = getRegForValue(I->getOperand(0)); if (!N) return false; - bool NIsKill = hasTrivialKill(I->getOperand(0)); // Keep a running tab of the total offset to coalesce multiple N = N + Offset // into a single N = N + TotalOffset. @@ -5041,18 +4912,15 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { continue; } if (TotalOffs) { - N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + N = emitAdd_ri_(VT, N, TotalOffs); if (!N) return false; - NIsKill = true; TotalOffs = 0; } // N = N + Idx * ElementSize; uint64_t ElementSize = DL.getTypeAllocSize(Ty); - std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx); - unsigned IdxN = Pair.first; - bool IdxNIsKill = Pair.second; + unsigned IdxN = getRegForGEPIndex(Idx); if (!IdxN) return false; @@ -5060,18 +4928,17 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize); if (!C) return false; - IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true); + IdxN = emitMul_rr(VT, IdxN, C); if (!IdxN) return false; - IdxNIsKill = true; } - N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); + N = fastEmit_rr(VT, VT, ISD::ADD, N, IdxN); if (!N) return false; } } if (TotalOffs) { - N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + N = emitAdd_ri_(VT, N, TotalOffs); if (!N) return false; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index f5df1c5e2929..06cc68155c37 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -252,7 +252,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( // Bail on stack adjustment needed on return for simplicity. const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)) + if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF)) return false; if (Exit && getArgumentPopSize(MF, *Exit)) return false; @@ -363,7 +363,7 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { return true; if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || MFI.hasStackMap() || MFI.hasPatchPoint() || - RegInfo->needsStackRealignment(MF)) + RegInfo->hasStackRealignment(MF)) return true; // With large callframes around we may need to use FP to access the scavenging // emergency spillslot. @@ -616,7 +616,7 @@ bool AArch64FrameLowering::canUseAsPrologue( const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. - if (!RegInfo->needsStackRealignment(*MF)) + if (!RegInfo->hasStackRealignment(*MF)) return true; // Otherwise, we can use any block as long as it has a scratch register // available. 
@@ -678,7 +678,7 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (MFI.hasVarSizedObjects()) return false; - if (RegInfo->needsStackRealignment(MF)) + if (RegInfo->hasStackRealignment(MF)) return false; // This isn't strictly necessary, but it simplifies things a bit since the @@ -1375,7 +1375,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes) { // Alignment is required for the parent frame, not the funclet const bool NeedsRealignment = - !IsFunclet && RegInfo->needsStackRealignment(MF); + !IsFunclet && RegInfo->hasStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { @@ -1981,13 +1981,13 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // Argument access should always use the FP. if (isFixed) { UseFP = hasFP(MF); - } else if (isCSR && RegInfo->needsStackRealignment(MF)) { + } else if (isCSR && RegInfo->hasStackRealignment(MF)) { // References to the CSR area must use FP if we're re-aligning the stack // since the dynamically-sized alignment padding is between the SP/BP and // the CSR area. assert(hasFP(MF) && "Re-aligned stack must have frame pointer"); UseFP = true; - } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) { + } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) { // If the FPOffset is negative and we're producing a signed immediate, we // have to keep in mind that the available offset range for negative // offsets is smaller than for positive ones. If an offset is available @@ -2029,9 +2029,10 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( } } - assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) && - "In the presence of dynamic stack pointer realignment, " - "non-argument/CSR objects cannot be accessed through the frame pointer"); + assert( + ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) && + "In the presence of dynamic stack pointer realignment, " + "non-argument/CSR objects cannot be accessed through the frame pointer"); if (isSVE) { StackOffset FPOffset = @@ -2041,10 +2042,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), ObjectOffset); // Always use the FP for SVE spills if available and beneficial. - if (hasFP(MF) && - (SPOffset.getFixed() || - FPOffset.getScalable() < SPOffset.getScalable() || - RegInfo->needsStackRealignment(MF))) { + if (hasFP(MF) && (SPOffset.getFixed() || + FPOffset.getScalable() < SPOffset.getScalable() || + RegInfo->hasStackRealignment(MF))) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 94b5d7718d0c..f70eee603706 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1339,6 +1339,11 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) { SDValue Ops[] = { Base, Offset, Chain }; SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT, MVT::Other, Ops); + + // Transfer memoperands. + MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Res), {MemOp}); + // Either way, we're replacing the node, so tell the caller that. 
SDValue LoadedVal = SDValue(Res, 1); if (InsertTo64) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5ab8d8a5d6f1..718fc8b7c1d0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -344,6 +344,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setCondCodeAction(ISD::SETUGT, VT, Expand); setCondCodeAction(ISD::SETUEQ, VT, Expand); setCondCodeAction(ISD::SETUNE, VT, Expand); + + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); } } @@ -1135,6 +1147,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::STEP_VECTOR, VT, Custom); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); @@ -1167,6 +1180,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) { + for (auto InnerVT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, + MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) { + // Avoid marking truncating FP stores as legal to prevent the + // DAGCombiner from creating unsupported truncating stores. + setTruncStoreAction(VT, InnerVT, Expand); + } + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); @@ -1387,6 +1407,20 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + if (VT.isFloatingPoint()) { + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } + // Lower fixed length vector operations to scalable equivalents. 
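// The block of setCondCodeAction(..., Expand) entries above leaves only the
// GE/GT/EQ/NE/UO style predicates for these FP vector types; the rest are
// expected to be rewritten by swapping operands or inverting the result. A
// scalar sanity check of the two identities being relied on (a sketch of the
// general rule, not of the legalizer itself):
#include <cassert>
#include <cmath>

static bool oge(double A, double B) { return !std::isnan(A) && !std::isnan(B) && A >= B; }
static bool ogt(double A, double B) { return !std::isnan(A) && !std::isnan(B) && A > B; }
static bool olt(double A, double B) { return !std::isnan(A) && !std::isnan(B) && A < B; }
static bool ult(double A, double B) { return std::isnan(A) || std::isnan(B) || A < B; }

int main() {
  const double Vals[] = {-1.0, 0.0, 2.5, NAN};
  for (double A : Vals)
    for (double B : Vals) {
      assert(olt(A, B) == ogt(B, A));  // SETOLT == SETOGT with swapped operands
      assert(ult(A, B) == !oge(A, B)); // SETULT == NOT(SETOGE)
    }
  return 0;
}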
setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); @@ -1399,6 +1433,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::FCEIL, VT, Custom); setOperationAction(ISD::FDIV, VT, Custom); setOperationAction(ISD::FFLOOR, VT, Custom); @@ -1420,6 +1455,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SIGN_EXTEND, VT, Custom); @@ -1442,6 +1478,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); @@ -2123,6 +2160,24 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( // Lowering Code //===----------------------------------------------------------------------===// +/// isZerosVector - Check whether SDNode N is a zero-filled vector. +static bool isZerosVector(const SDNode *N) { + // Look through a bit convert. + while (N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0).getNode(); + + if (ISD::isConstantSplatVectorAllZeros(N)) + return true; + + if (N->getOpcode() != AArch64ISD::DUP) + return false; + + auto Opnd0 = N->getOperand(0); + auto *CINT = dyn_cast<ConstantSDNode>(Opnd0); + auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0); + return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero()); +} + /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { @@ -3894,9 +3949,13 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2)); } case Intrinsic::aarch64_neon_sdot: - case Intrinsic::aarch64_neon_udot: { - unsigned Opcode = IntNo == Intrinsic::aarch64_neon_udot ? AArch64ISD::UDOT - : AArch64ISD::SDOT; + case Intrinsic::aarch64_neon_udot: + case Intrinsic::aarch64_sve_sdot: + case Intrinsic::aarch64_sve_udot: { + unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || + IntNo == Intrinsic::aarch64_sve_udot) + ? 
AArch64ISD::UDOT + : AArch64ISD::SDOT; return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } @@ -4402,6 +4461,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::SPLAT_VECTOR: return LowerSPLAT_VECTOR(Op, DAG); + case ISD::STEP_VECTOR: + return LowerSTEP_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::INSERT_SUBVECTOR: @@ -5107,11 +5168,11 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); - // If this function uses the C calling convention but has an SVE signature, - // then it preserves more registers and should assume the SVE_VectorCall CC. + // Functions using the C or Fast calling convention that have an SVE signature + // preserve more registers and should assume the SVE_VectorCall CC. // The check for matching callee-saved regs will determine whether it is // eligible for TCO. - if (CallerCC == CallingConv::C && + if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && AArch64RegisterInfo::hasSVEArgsOrReturn(&MF)) CallerCC = CallingConv::AArch64_SVE_VectorCall; @@ -5304,7 +5365,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Check callee args/returns for SVE registers and set calling convention // accordingly. - if (CallConv == CallingConv::C) { + if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }); @@ -6994,6 +7055,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); } + if (useSVEForFixedLengthVectorVT(Ty)) { + // FIXME: Ideally this would be the same as above using i1 types, however + // for the moment we can't deal with fixed i1 vector types properly, so + // instead extend the predicate to a result type sized integer vector. + MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits()); + MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount()); + SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT); + SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal); + return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); + } + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. 
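// A scalar model of the fixed-length SELECT lowering added above: the i1
// condition is sign-extended to an element-sized integer, splatted, and the
// resulting VSELECT is a per-lane bitwise blend. The 4 x i32 shape is an
// arbitrary stand-in for whatever fixed-length type is being lowered.
#include <cassert>
#include <cstdint>

int main() {
  bool Cond = true;
  uint32_t Mask = Cond ? 0xFFFFFFFFu : 0u; // getSExtOrTrunc: true -> all ones
  uint32_t TVal[4] = {1, 2, 3, 4};
  uint32_t FVal[4] = {5, 6, 7, 8};
  uint32_t Res[4];
  for (int I = 0; I < 4; ++I)
    Res[I] = (TVal[I] & Mask) | (FVal[I] & ~Mask); // per-lane VSELECT
  for (int I = 0; I < 4; ++I)
    assert(Res[I] == (Cond ? TVal[I] : FVal[I]));
  return 0;
}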
if (ISD::isOverflowIntrOpRes(CCVal)) { @@ -9049,6 +9121,20 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return GenerateTBL(Op, ShuffleMask, DAG); } +SDValue AArch64TargetLowering::LowerSTEP_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + assert(VT.isScalableVector() && + "Only expect scalable vectors for STEP_VECTOR"); + assert(VT.getScalarType() != MVT::i1 && + "Vectors of i1 types not supported for STEP_VECTOR"); + + SDValue StepVal = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, StepVal.getValueType()); + return DAG.getNode(AArch64ISD::INDEX_VECTOR, dl, VT, Zero, StepVal); +} + SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -9663,10 +9749,10 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, } if (i > 0) isOnlyLowElement = false; - if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) + if (!isIntOrFPConstant(V)) isConstant = false; - if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { + if (isIntOrFPConstant(V)) { ++NumConstantLanes; if (!ConstantValue.getNode()) ConstantValue = V; @@ -9691,7 +9777,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, // Convert BUILD_VECTOR where all elements but the lowest are undef into // SCALAR_TO_VECTOR, except for when we have a single-element constant vector // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. - if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) { + if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) { LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " "SCALAR_TO_VECTOR node\n"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); @@ -9832,7 +9918,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); - if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) + if (!isIntOrFPConstant(V)) // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); @@ -9932,6 +10018,9 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerFixedLengthInsertVectorElt(Op, DAG); + // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); @@ -9967,8 +10056,11 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); - // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); + if (useSVEForFixedLengthVectorVT(VT)) + return LowerFixedLengthExtractVectorElt(Op, DAG); + + // Check for non-constant or out of range lane. 
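// LowerSTEP_VECTOR above maps ISD::STEP_VECTOR onto AArch64ISD::INDEX_VECTOR
// with a zero base, i.e. the SVE `index` form. A scalar reference for what one
// lane ends up holding; the 8-lane width is fixed here only for the example,
// the real type is scalable.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t Base = 0; // STEP_VECTOR always lowers with a zero start value
  const int64_t Step = 3; // the single STEP_VECTOR operand
  int64_t Lanes[8];
  for (unsigned I = 0; I < 8; ++I)
    Lanes[I] = Base + static_cast<int64_t>(I) * Step; // index z0.d, #0, #3
  assert(Lanes[0] == 0 && Lanes[1] == 3 && Lanes[7] == 21);
  return 0;
}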
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); @@ -10372,11 +10464,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isScalableVector()) { - if (Op.getOperand(0).getValueType().isFloatingPoint()) - return Op; + if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); - } if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) return LowerFixedLengthVectorSetccToSVE(Op, DAG); @@ -13280,7 +13369,7 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { auto isZeroDot = [](SDValue Dot) { return (Dot.getOpcode() == AArch64ISD::UDOT || Dot.getOpcode() == AArch64ISD::SDOT) && - ISD::isBuildVectorAllZeros(Dot.getOperand(0).getNode()); + isZerosVector(Dot.getOperand(0).getNode()); }; if (!isZeroDot(Dot)) std::swap(Dot, A); @@ -13911,78 +14000,7 @@ static SDValue performExtendCombine(SDNode *N, return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } - - // This is effectively a custom type legalization for AArch64. - // - // Type legalization will split an extend of a small, legal, type to a larger - // illegal type by first splitting the destination type, often creating - // illegal source types, which then get legalized in isel-confusing ways, - // leading to really terrible codegen. E.g., - // %result = v8i32 sext v8i8 %value - // becomes - // %losrc = extract_subreg %value, ... - // %hisrc = extract_subreg %value, ... - // %lo = v4i32 sext v4i8 %losrc - // %hi = v4i32 sext v4i8 %hisrc - // Things go rapidly downhill from there. - // - // For AArch64, the [sz]ext vector instructions can only go up one element - // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 - // take two instructions. - // - // This implies that the most efficient way to do the extend from v8i8 - // to two v4i32 values is to first extend the v8i8 to v8i16, then do - // the normal splitting to happen for the v8i16->v8i32. - - // This is pre-legalization to catch some cases where the default - // type legalization will create ill-tempered code. - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - // We're only interested in cleaning things up for non-legal vector types - // here. If both the source and destination are legal, things will just - // work naturally without any fiddling. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) - return SDValue(); - // If the vector type isn't a simple VT, it's beyond the scope of what - // we're worried about here. Let legalization do its thing and hope for - // the best. - SDValue Src = N->getOperand(0); - EVT SrcVT = Src->getValueType(0); - if (!ResVT.isSimple() || !SrcVT.isSimple()) - return SDValue(); - - // If the source VT is a 64-bit fixed or scalable vector, we can play games - // and get the better results we want. - if (SrcVT.getSizeInBits().getKnownMinSize() != 64) - return SDValue(); - - unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); - ElementCount SrcEC = SrcVT.getVectorElementCount(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC); - SDLoc DL(N); - Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); - - // Now split the rest of the operation into two halves, each with a 64 - // bit source. 
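// performAddDotCombine above folds add(X, [us]dot(zeros, A, B)) into
// [us]dot(X, A, B); the new isZerosVector helper is what lets it spot a zero
// splat behind a bitcast or an AArch64ISD::DUP of zero. A scalar reference of
// one 32-bit UDOT lane shows why the fold is sound: the accumulator enters the
// sum linearly.
#include <cassert>
#include <cstdint>

static uint32_t udotLane(uint32_t Acc, const uint8_t A[4], const uint8_t B[4]) {
  for (int I = 0; I < 4; ++I)
    Acc += static_cast<uint32_t>(A[I]) * static_cast<uint32_t>(B[I]);
  return Acc;
}

int main() {
  const uint8_t A[4] = {1, 2, 3, 4};
  const uint8_t B[4] = {5, 6, 7, 8};
  const uint32_t X = 100;
  assert(X + udotLane(0, A, B) == udotLane(X, A, B)); // add(X, dot(0,A,B)) == dot(X,A,B)
  return 0;
}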
- EVT LoVT, HiVT; - SDValue Lo, Hi; - LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext()); - - EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorElementCount()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(0, DL, MVT::i64)); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64)); - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); - - // Now combine the parts back together so we still have a single result - // like the combiner expects. - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); + return SDValue(); } static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, @@ -15213,7 +15231,8 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { } } - if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || + if (N0.getOpcode() != ISD::SETCC || + CCVT.getVectorElementCount() != ElementCount::getFixed(1) || CCVT.getVectorElementType() != MVT::i1) return SDValue(); @@ -17221,6 +17240,35 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( return convertFromScalableVector(DAG, VT, Val); } +SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt( + SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT InVT = Op.getOperand(0).getValueType(); + assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!"); + + SDLoc DL(Op); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( + SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + SDLoc DL(Op); + EVT InVT = Op.getOperand(0).getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); + + auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0, + Op.getOperand(1), Op.getOperand(2)); + + return convertFromScalableVector(DAG, VT, ScalableRes); +} + // Convert vector operation 'Op' to an equivalent predicated operation whereby // the original operation's type is used to construct a suitable predicate. // NOTE: The results for inactive lanes are undefined. @@ -17437,10 +17485,6 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE( assert(Op.getValueType() == InVT.changeTypeToInteger() && "Expected integer result of the same bit length as the inputs!"); - // Expand floating point vector comparisons. 
- if (InVT.isFloatingPoint()) - return SDValue(); - auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 1264d6779924..63df22326150 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -936,6 +936,7 @@ private: SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp, bool OverrideNEON = false) const; @@ -987,6 +988,8 @@ private: SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthExtractVectorElt(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthInsertVectorElt(SDValue Op, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index f90856d14b2f..9f1b791e18b5 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -382,7 +382,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // stack needs to be dynamically re-aligned, the base pointer is the only // reliable way to reference the locals. 
if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) { - if (needsStackRealignment(MF)) + if (hasStackRealignment(MF)) return true; if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { @@ -437,7 +437,7 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE(); + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE(); } bool AArch64RegisterInfo::requiresFrameIndexScavenging( @@ -761,7 +761,7 @@ unsigned AArch64RegisterInfo::getLocalAddressRegister( const auto &MFI = MF.getFrameInfo(); if (!MF.hasEHFunclets() && !MFI.hasVarSizedObjects()) return AArch64::SP; - else if (needsStackRealignment(MF)) + else if (hasStackRealignment(MF)) return getBaseRegister(); return getFrameRegister(MF); } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 63a53cc0c8f1..df4e2cd44623 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -262,18 +262,6 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>; def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; -def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs), - [(setoge node:$lhs, node:$rhs), - (setge node:$lhs, node:$rhs)]>; -def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs), - [(setogt node:$lhs, node:$rhs), - (setgt node:$lhs, node:$rhs)]>; -def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs), - [(setoeq node:$lhs, node:$rhs), - (seteq node:$lhs, node:$rhs)]>; -def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs), - [(setone node:$lhs, node:$rhs), - (setne node:$lhs, node:$rhs)]>; def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), (AArch64mul_p node:$pred, node:$src1, node:$src2), [{ return N->hasOneUse(); @@ -365,8 +353,8 @@ let Predicates = [HasSVE] in { defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>; defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>; - defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; - defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; + defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>; + defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>; defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; @@ -1252,11 +1240,11 @@ let Predicates = [HasSVE] in { defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; - defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>; + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, SETOGT, SETGT, SETOLT, SETLT>; + defm FCMEQ_PPzZZ : 
sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, SETONE, SETNE, SETONE, SETNE>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, SETUO, SETUO, SETUO, SETUO>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -2288,8 +2276,6 @@ let Predicates = [HasSVE] in { (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; - def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), - (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)), (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>; def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)), diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td index ff7766f2caec..0015c27228f6 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -339,5 +339,4 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; -def A55RCU : RetireControlUnit<64, 0>; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 8fe2f125982f..ce5a0128e622 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -196,9 +196,14 @@ protected: // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing = false; bool HasZeroCycleZeroingGP = false; - bool HasZeroCycleZeroingFP = false; bool HasZeroCycleZeroingFPWorkaround = false; + // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0", + // as movi is more efficient across all cores. Newer cores can eliminate + // fmovs early and there is no difference with movi, but this is not true for + // all implementations. + bool HasZeroCycleZeroingFP = true; + // StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false; @@ -557,7 +562,7 @@ public: bool enableEarlyIfConversion() const override; - bool enableAdvancedRASplitCost() const override { return true; } + bool enableAdvancedRASplitCost() const override { return false; } std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 23b6978edac1..148239b3d789 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -212,7 +212,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned +InstructionCost AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { auto *RetTy = ICA.getReturnType(); @@ -260,6 +260,19 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return LT.first; break; } + case Intrinsic::experimental_stepvector: { + unsigned Cost = 1; // Cost of the `index' instruction + auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + // Legalisation of illegal vectors involves an `index' instruction plus + // (LT.first - 1) vector adds. + if (LT.first > 1) { + Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); + unsigned AddCost = + getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); + Cost += AddCost * (LT.first - 1); + } + return Cost; + } default: break; } @@ -378,6 +391,23 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + // Truncations on nxvmiN + { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, + { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, + { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, + { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, + { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, + { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, + { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, + { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, + { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, + { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, + { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, + { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, + { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, + { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, + { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, + // The number of shll instructions for the extension. 
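// A tiny model of the experimental_stepvector cost added above: one `index`
// instruction for the legal part, plus (LT.first - 1) vector adds when the
// requested type has to be split. The add cost is fixed at 1 here purely for
// illustration; the real code asks getArithmeticInstrCost for it.
#include <cassert>

static unsigned stepVectorCost(unsigned NumLegalParts, unsigned AddCost = 1) {
  unsigned Cost = 1; // the `index' instruction
  if (NumLegalParts > 1)
    Cost += AddCost * (NumLegalParts - 1); // one add per extra legal part
  return Cost;
}

int main() {
  assert(stepVectorCost(1) == 1); // e.g. an already legal stepvector
  assert(stepVectorCost(4) == 4); // e.g. a type split into four legal parts
  return 0;
}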
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, @@ -459,6 +489,23 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, + // Lowering scalable + { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, + { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, + { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, + { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, + { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, + { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, + + + // Complex, from nxv2f32 legal type is nxv2i32 (no cost) or nxv2i64 (1 ext) + { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 2 }, + { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, + { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, + { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 2 }, + { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, + { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, + // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, @@ -466,6 +513,75 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, + + // Complex, from nxv2f64: legal type is nxv2i32, 1 narrowing => ~2. + { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 2 }, + { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 2 }, + { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 2 }, + + // Complex, from nxv4f32 legal type is nxv4i16, 1 narrowing => ~2 + { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 2 }, + { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 2 }, + { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 2 }, + { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 2 }, + + // Complex, from nxv8f64: legal type is nxv8i32, 1 narrowing => ~2. + { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f64, 2 }, + { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 2 }, + { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 2 }, + + // Complex, from nxv4f64: legal type is nxv4i32, 1 narrowing => ~2. + { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 2 }, + { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 2 }, + { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 2 }, + { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 2 }, + + // Complex, from nxv8f32: legal type is nxv8i32 (no cost) or nxv8i64 (1 ext). + { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f32, 2 }, + { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, + { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 1 }, + { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f32, 2 }, + { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 1 }, + { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 1 }, + + // Truncate from nxvmf32 to nxvmf16. + { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, + { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, + { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, + + // Truncate from nxvmf64 to nxvmf16. 
+ { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, + { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, + { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, + + // Truncate from nxvmf64 to nxvmf32. + { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, + { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, + { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, + + // Extend from nxvmf16 to nxvmf32. + { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, + { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, + { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, + + // Extend from nxvmf16 to nxvmf64. + { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, + { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, + { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, + + // Extend from nxvmf32 to nxvmf64. + { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, + { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, + { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, + }; if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, @@ -537,7 +653,8 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, } unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind, + const Instruction *I) { if (CostKind != TTI::TCK_RecipThroughput) return Opcode == Instruction::PHI ? 0 : 1; assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index afb470592c8b..7a6cfd36fcc3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -97,18 +97,22 @@ public: return 31; } - unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind); - - unsigned getRegisterBitWidth(bool Vector) const { - if (Vector) { + InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { + switch (K) { + case TargetTransformInfo::RGK_Scalar: + return TypeSize::getFixed(64); + case TargetTransformInfo::RGK_FixedWidthVector: if (ST->hasSVE()) - return std::max(ST->getMinSVEVectorSizeInBits(), 128u); - if (ST->hasNEON()) - return 128; - return 0; + return TypeSize::getFixed( + std::max(ST->getMinSVEVectorSizeInBits(), 128u)); + return TypeSize::getFixed(ST->hasNEON() ? 128 : 0); + case TargetTransformInfo::RGK_ScalableVector: + return TypeSize::getScalable(ST->hasSVE() ? 
128 : 0); } - return 64; + llvm_unreachable("Unsupported register kind"); } unsigned getMinVectorRegisterBitWidth() { @@ -135,7 +139,8 @@ public: int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); - unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); + unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e495003e3972..9141e786977a 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -2906,6 +2906,7 @@ static const struct Extension { {"mte", {AArch64::FeatureMTE}}, {"memtag", {AArch64::FeatureMTE}}, {"tlb-rmi", {AArch64::FeatureTLB_RMI}}, + {"pan", {AArch64::FeaturePAN}}, {"pan-rwv", {AArch64::FeaturePAN_RWV}}, {"ccpp", {AArch64::FeatureCCPP}}, {"rcpc", {AArch64::FeatureRCPC}}, @@ -2921,7 +2922,6 @@ static const struct Extension { {"pauth", {AArch64::FeaturePAuth}}, {"flagm", {AArch64::FeatureFlagM}}, // FIXME: Unsupported extensions - {"pan", {}}, {"lor", {}}, {"rdma", {}}, {"profile", {}}, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp index 7c543028af9f..a1392ccb59e6 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp @@ -29,10 +29,31 @@ AArch64GISelUtils::getAArch64VectorSplat(const MachineInstr &MI, return RegOrConstant(Src); } -Optional<int64_t> AArch64GISelUtils::getAArch64VectorSplatScalar( - const MachineInstr &MI, const MachineRegisterInfo &MRI) { +Optional<int64_t> +AArch64GISelUtils::getAArch64VectorSplatScalar(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { auto Splat = getAArch64VectorSplat(MI, MRI); if (!Splat || Splat->isReg()) return None; return Splat->getCst(); } + +bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub, + const CmpInst::Predicate &Pred, + const MachineRegisterInfo &MRI) { + // Match: + // + // %sub = G_SUB 0, %y + // %cmp = G_ICMP eq/ne, %sub, %z + // + // Or + // + // %sub = G_SUB 0, %y + // %cmp = G_ICMP eq/ne, %z, %sub + if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB || + !CmpInst::isEquality(Pred)) + return false; + auto MaybeZero = + getConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI); + return MaybeZero && MaybeZero->Value.getZExtValue() == 0; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h index b1e575d4e4d6..142d999ef05a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h @@ -15,9 +15,12 @@ #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/Register.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/IR/InstrTypes.h" #include <cstdint> namespace llvm { + namespace AArch64GISelUtils { /// \returns true if \p C is a legal immediate operand for an arithmetic @@ -36,6 +39,11 @@ Optional<RegOrConstant> getAArch64VectorSplat(const MachineInstr &MI, Optional<int64_t> getAArch64VectorSplatScalar(const MachineInstr &MI, const MachineRegisterInfo &MRI); +/// \returns true if \p MaybeSub and \p Pred are part of a CMN tree for an +/// integer 
compare. +bool isCMN(const MachineInstr *MaybeSub, const CmpInst::Predicate &Pred, + const MachineRegisterInfo &MRI); + } // namespace AArch64GISelUtils } // namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 68c2e1e95048..7160432884fe 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -18,6 +18,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "AArch64GlobalISelUtils.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/Optional.h" @@ -1796,7 +1797,7 @@ bool AArch64InstructionSelector::selectVectorAshrLshr( NegOpc = AArch64::NEGv8i16; } else if (Ty == LLT::vector(16, 8)) { Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; - NegOpc = AArch64::NEGv8i16; + NegOpc = AArch64::NEGv16i8; } else if (Ty == LLT::vector(8, 8)) { Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8; NegOpc = AArch64::NEGv8i8; @@ -2305,6 +2306,30 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { MachineIRBuilder MIB(I); switch (Opcode) { + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: { + static const unsigned OpcTable[2][2] = { + {AArch64::UBFMWri, AArch64::UBFMXri}, + {AArch64::SBFMWri, AArch64::SBFMXri}}; + bool IsSigned = Opcode == TargetOpcode::G_SBFX; + unsigned Size = Ty.getSizeInBits(); + unsigned Opc = OpcTable[IsSigned][Size == 64]; + auto Cst1 = + getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI); + assert(Cst1 && "Should have gotten a constant for src 1?"); + auto Cst2 = + getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI); + assert(Cst2 && "Should have gotten a constant for src 2?"); + auto LSB = Cst1->Value.getZExtValue(); + auto Width = Cst2->Value.getZExtValue(); + MachineIRBuilder MIB(I); + auto BitfieldInst = + MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)}) + .addImm(LSB) + .addImm(Width); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI); + } case TargetOpcode::G_BRCOND: return selectCompareBranch(I, MF, MRI); @@ -4553,37 +4578,10 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( // // cmn z, y - // Helper lambda to detect the subtract followed by the compare. - // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0. - auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) { - if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB) - return false; - - // Need to make sure NZCV is the same at the end of the transformation. - if (CC != AArch64CC::EQ && CC != AArch64CC::NE) - return false; - - // We want to match against SUBs. - if (DefMI->getOpcode() != TargetOpcode::G_SUB) - return false; - - // Make sure that we're getting - // x = G_SUB 0, y - auto ValAndVReg = - getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI); - if (!ValAndVReg || ValAndVReg->Value != 0) - return false; - - // This can safely be represented as a CMN. 
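// The isCMN helper above only fires for equality predicates: for eq/ne,
// comparing x against 0 - y is the same as asking whether x + y is zero, which
// is exactly the flag a CMN (an ADDS that discards its result into the zero
// register) produces. A scalar check of that identity, including a wrap-around
// case that would break the signed orderings but not eq/ne:
#include <cassert>
#include <cstdint>

static bool eqViaSub(uint32_t X, uint32_t Y) { return X == 0u - Y; } // cmp x, (0 - y)
static bool eqViaCmn(uint32_t X, uint32_t Y) { return X + Y == 0u; } // cmn x, y

int main() {
  const uint32_t Cases[][2] = {
      {5, 0xFFFFFFFBu}, {0, 0}, {1, 2}, {0x80000000u, 0x80000000u}};
  for (const auto &C : Cases)
    assert(eqViaSub(C[0], C[1]) == eqViaCmn(C[0], C[1]));
  return 0;
}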
- return true; - }; - // Check if the RHS or LHS of the G_ICMP is defined by a SUB MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); - CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate(); - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P); - + auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); // Given this: // // x = G_SUB 0, y @@ -4592,7 +4590,7 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( // Produce this: // // cmn y, z - if (IsCMN(LHSDef, CC)) + if (isCMN(LHSDef, P, MRI)) return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); // Same idea here, but with the RHS of the compare instead: @@ -4605,7 +4603,7 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( // Produce this: // // cmn z, y - if (IsCMN(RHSDef, CC)) + if (isCMN(RHSDef, P, MRI)) return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); // Given this: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 83ffe09612bb..d2b7b566cf85 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -647,6 +647,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalForCartesianProduct( {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) .scalarize(1); + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower(); getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) .legalIf([=](const LegalityQuery &Query) { @@ -681,7 +682,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); - getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) + .libcall(); getActionDefinitionsBuilder(G_ABS).lowerIf( [=](const LegalityQuery &Query) { return Query.Types[0].isScalar(); }); @@ -689,16 +691,33 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_VECREDUCE_FADD) // We only have FADDP to do reduction-like operations. Lower the rest. 
.legalFor({{s32, v2s32}, {s64, v2s64}}) + .clampMaxNumElements(1, s64, 2) + .clampMaxNumElements(1, s32, 2) .lower(); getActionDefinitionsBuilder(G_VECREDUCE_ADD) .legalFor( {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) + .clampMaxNumElements(1, s64, 2) + .clampMaxNumElements(1, s32, 4) .lower(); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); }); + getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower(); + + getActionDefinitionsBuilder(G_ROTR) + .legalFor({{s32, s64}, {s64, s64}}) + .customIf([=](const LegalityQuery &Q) { + return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64; + }) + .lower(); + getActionDefinitionsBuilder(G_ROTL).lower(); + + getActionDefinitionsBuilder({G_SBFX, G_UBFX}) + .customFor({{s32, s32}, {s64, s64}}); + computeTables(); verify(*ST.getInstrInfo()); } @@ -725,11 +744,33 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); case TargetOpcode::G_TRUNC: return legalizeVectorTrunc(MI, Helper); + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: + return legalizeBitfieldExtract(MI, MRI, Helper); + case TargetOpcode::G_ROTR: + return legalizeRotate(MI, MRI, Helper); } llvm_unreachable("expected switch to return"); } +bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI, + MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const { + // To allow for imported patterns to match, we ensure that the rotate amount + // is 64b with an extension. + Register AmtReg = MI.getOperand(2).getReg(); + LLT AmtTy = MRI.getType(AmtReg); + (void)AmtTy; + assert(AmtTy.isScalar() && "Expected a scalar rotate"); + assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal"); + auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg); + Helper.Observer.changingInstr(MI); + MI.getOperand(2).setReg(NewAmt.getReg(0)); + Helper.Observer.changedInstr(MI); + return true; +} + static void extractParts(Register Reg, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts, SmallVectorImpl<Register> &VRegs) { @@ -944,3 +985,11 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, MI.eraseFromParent(); return true; } + +bool AArch64LegalizerInfo::legalizeBitfieldExtract( + MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const { + // Only legal if we can select immediate forms. + // TODO: Lower this otherwise. 
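// legalizeRotate above widens a narrow rotate amount to 64 bits so the
// imported selection patterns can match; this is safe because the rotate only
// depends on the amount modulo the data size, and extending the amount
// register leaves those low bits unchanged. A 32-bit scalar check of that
// claim (rotr32 is a hand-rolled stand-in, not an LLVM helper):
#include <cassert>
#include <cstdint>

static uint32_t rotr32(uint32_t X, uint64_t Amt) {
  unsigned S = static_cast<unsigned>(Amt % 32); // only the low bits matter
  return S == 0 ? X : (X >> S) | (X << (32 - S));
}

int main() {
  const uint32_t X = 0x12345678u;
  const int32_t NarrowAmt = 8;
  const uint64_t WideAmt = static_cast<uint64_t>(static_cast<int64_t>(NarrowAmt)); // the G_SEXT
  assert(rotr32(X, static_cast<uint64_t>(NarrowAmt)) == rotr32(X, WideAmt));
  assert(rotr32(X, 8) == 0x78123456u);
  return 0;
}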
+ return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) && + getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 8217e37c8512..5d78dc64a2f1 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -47,6 +48,10 @@ private: MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const; + bool legalizeBitfieldExtract(MachineInstr &MI, MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const; + bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const; const AArch64Subtarget *ST; }; } // End llvm namespace. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index 66a5747e3031..4bfbcb5c419b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineDominators.h" @@ -36,6 +37,7 @@ #define DEBUG_TYPE "aarch64-postlegalizer-combiner" using namespace llvm; +using namespace MIPatternMatch; /// This combine tries do what performExtractVectorEltCombine does in SDAG. /// Rewrite for pairwise fadd pattern @@ -238,6 +240,34 @@ bool applyAArch64MulConstCombine( return true; } +/// Form a G_SBFX from a G_SEXT_INREG fed by a right shift. 
+static bool matchBitfieldExtractFromSExtInReg( + MachineInstr &MI, MachineRegisterInfo &MRI, + std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + int64_t Width = MI.getOperand(2).getImm(); + LLT Ty = MRI.getType(Src); + assert((Ty == LLT::scalar(32) || Ty == LLT::scalar(64)) && + "Unexpected type for G_SEXT_INREG?"); + Register ShiftSrc; + int64_t ShiftImm; + if (!mi_match( + Src, MRI, + m_OneNonDBGUse(m_any_of(m_GAShr(m_Reg(ShiftSrc), m_ICst(ShiftImm)), + m_GLShr(m_Reg(ShiftSrc), m_ICst(ShiftImm)))))) + return false; + if (ShiftImm < 0 || ShiftImm + Width > Ty.getSizeInBits()) + return false; + MatchInfo = [=](MachineIRBuilder &B) { + auto Cst1 = B.buildConstant(Ty, ShiftImm); + auto Cst2 = B.buildConstant(Ty, ShiftImm + Width - 1); + B.buildInstr(TargetOpcode::G_SBFX, {Dst}, {ShiftSrc, Cst1, Cst2}); + }; + return true; +} + #define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AArch64GenPostLegalizeGICombiner.inc" #undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 130416a04c6d..558cd239f6f7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -735,6 +735,113 @@ static bool applyBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } +/// \returns how many instructions would be saved by folding a G_ICMP's shift +/// and/or extension operations. +static unsigned getCmpOperandFoldingProfit(Register CmpOp, + const MachineRegisterInfo &MRI) { + // No instructions to save if there's more than one use or no uses. + if (!MRI.hasOneNonDBGUse(CmpOp)) + return 0; + + // FIXME: This is duplicated with the selector. (See: selectShiftedRegister) + auto IsSupportedExtend = [&](const MachineInstr &MI) { + if (MI.getOpcode() == TargetOpcode::G_SEXT_INREG) + return true; + if (MI.getOpcode() != TargetOpcode::G_AND) + return false; + auto ValAndVReg = + getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (!ValAndVReg) + return false; + uint64_t Mask = ValAndVReg->Value.getZExtValue(); + return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); + }; + + MachineInstr *Def = getDefIgnoringCopies(CmpOp, MRI); + if (IsSupportedExtend(*Def)) + return 1; + + unsigned Opc = Def->getOpcode(); + if (Opc != TargetOpcode::G_SHL && Opc != TargetOpcode::G_ASHR && + Opc != TargetOpcode::G_LSHR) + return 0; + + auto MaybeShiftAmt = + getConstantVRegValWithLookThrough(Def->getOperand(2).getReg(), MRI); + if (!MaybeShiftAmt) + return 0; + uint64_t ShiftAmt = MaybeShiftAmt->Value.getZExtValue(); + MachineInstr *ShiftLHS = + getDefIgnoringCopies(Def->getOperand(1).getReg(), MRI); + + // Check if we can fold an extend and a shift. + // FIXME: This is duplicated with the selector. (See: + // selectArithExtendedRegister) + if (IsSupportedExtend(*ShiftLHS)) + return (ShiftAmt <= 4) ? 2 : 1; + + LLT Ty = MRI.getType(Def->getOperand(0).getReg()); + if (Ty.isVector()) + return 0; + unsigned ShiftSize = Ty.getSizeInBits(); + if ((ShiftSize == 32 && ShiftAmt <= 31) || + (ShiftSize == 64 && ShiftAmt <= 63)) + return 1; + return 0; +} + +/// \returns true if it would be profitable to swap the LHS and RHS of a G_ICMP +/// instruction \p MI. 
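// matchBitfieldExtractFromSExtInReg above rewrites sign_extend_inreg of a
// single-use right shift into one signed bitfield extract. The underlying
// identity, checked on 32-bit scalars (the ShiftImm + Width > bit-width guard
// is what keeps the field in range; two's-complement shifts are assumed):
#include <cassert>
#include <cstdint>

static int32_t sextInReg(uint32_t X, unsigned Width) {
  unsigned Shift = 32 - Width;
  return static_cast<int32_t>(X << Shift) >> Shift; // sign-extend the low Width bits
}

static int32_t sbfx32(uint32_t X, unsigned Lsb, unsigned Width) {
  uint32_t Field = (X >> Lsb) & ((Width == 32) ? ~0u : ((1u << Width) - 1));
  return sextInReg(Field, Width);
}

int main() {
  const uint32_t X = 0xDEADBEEFu;
  const unsigned Lsb = 4, Width = 8;
  // sext_inreg(lshr(X, Lsb), Width) == sbfx(X, Lsb, Width)
  assert(sextInReg(X >> Lsb, Width) == sbfx32(X, Lsb, Width));
  return 0;
}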
+static bool trySwapICmpOperands(MachineInstr &MI, + const MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_ICMP); + // Swap the operands if it would introduce a profitable folding opportunity. + // (e.g. a shift + extend). + // + // For example: + // lsl w13, w11, #1 + // cmp w13, w12 + // can be turned into: + // cmp w12, w11, lsl #1 + + // Don't swap if there's a constant on the RHS, because we know we can fold + // that. + Register RHS = MI.getOperand(3).getReg(); + auto RHSCst = getConstantVRegValWithLookThrough(RHS, MRI); + if (RHSCst && isLegalArithImmed(RHSCst->Value.getSExtValue())) + return false; + + Register LHS = MI.getOperand(2).getReg(); + auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + auto GetRegForProfit = [&](Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + return isCMN(Def, Pred, MRI) ? Def->getOperand(2).getReg() : Reg; + }; + + // Don't have a constant on the RHS. If we swap the LHS and RHS of the + // compare, would we be able to fold more instructions? + Register TheLHS = GetRegForProfit(LHS); + Register TheRHS = GetRegForProfit(RHS); + + // If the LHS is more likely to give us a folding opportunity, then swap the + // LHS and RHS. + return (getCmpOperandFoldingProfit(TheLHS, MRI) > + getCmpOperandFoldingProfit(TheRHS, MRI)); +} + +static bool applySwapICmpOperands(MachineInstr &MI, + GISelChangeObserver &Observer) { + auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + Register LHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(3).getReg(); + Observer.changedInstr(MI); + MI.getOperand(1).setPredicate(CmpInst::getSwappedPredicate(Pred)); + MI.getOperand(2).setReg(RHS); + MI.getOperand(3).setReg(LHS); + Observer.changedInstr(MI); + return true; +} + #define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS #include "AArch64GenPostLegalizeGILowering.inc" #undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 26029b4db11f..4efc63ea68b7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -217,6 +217,46 @@ static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } +/// Replace a G_MEMSET with a value of 0 with a G_BZERO instruction if it is +/// supported and beneficial to do so. +/// +/// \note This only applies on Darwin. +/// +/// \returns true if \p MI was replaced with a G_BZERO. +static bool tryEmitBZero(MachineInstr &MI, MachineIRBuilder &MIRBuilder, + bool MinSize) { + assert(MI.getOpcode() == TargetOpcode::G_MEMSET); + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + if (!TLI.getLibcallName(RTLIB::BZERO)) + return false; + auto Zero = getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI); + if (!Zero || Zero->Value.getSExtValue() != 0) + return false; + + // It's not faster to use bzero rather than memset for sizes <= 256. + // However, it *does* save us a mov from wzr, so if we're going for + // minsize, use bzero even if it's slower. + if (!MinSize) { + // If the size is known, check it. If it is not known, assume using bzero is + // better. 
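// applySwapICmpOperands above has to flip the predicate whenever it flips the
// operands; the payoff is that the operand with a foldable shift or extend
// ends up where the AArch64 compare can absorb it (cmp w12, w11, lsl #1 in the
// comment above). A scalar check that swapping operands plus predicate keeps
// the compare's meaning (only the signed orderings are modelled here):
#include <cassert>
#include <cstdint>

enum Pred { SLT, SGT, SLE, SGE };

static Pred swapped(Pred P) {
  switch (P) {
  case SLT: return SGT;
  case SGT: return SLT;
  case SLE: return SGE;
  case SGE: return SLE;
  }
  return P;
}

static bool icmp(Pred P, int64_t A, int64_t B) {
  switch (P) {
  case SLT: return A < B;
  case SGT: return A > B;
  case SLE: return A <= B;
  case SGE: return A >= B;
  }
  return false;
}

int main() {
  const int64_t X = int64_t(7) << 1; // the shifted side we would like on the RHS
  const int64_t Y = 13;
  const Pred Preds[] = {SLT, SGT, SLE, SGE};
  for (Pred P : Preds)
    assert(icmp(P, X, Y) == icmp(swapped(P), Y, X));
  return 0;
}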
+ if (auto Size = + getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI)) { + if (Size->Value.getSExtValue() <= 256) + return false; + } + } + + MIRBuilder.setInstrAndDebugLoc(MI); + MIRBuilder + .buildInstr(TargetOpcode::G_BZERO, {}, + {MI.getOperand(0), MI.getOperand(2)}) + .addImm(MI.getOperand(3).getImm()) + .addMemOperand(*MI.memoperands_begin()); + MI.eraseFromParent(); + return true; +} + class AArch64PreLegalizerCombinerHelperState { protected: CombinerHelper &Helper; @@ -263,7 +303,8 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, if (Generated.tryCombineAll(Observer, MI, B)) return true; - switch (MI.getOpcode()) { + unsigned Opc = MI.getOpcode(); + switch (Opc) { case TargetOpcode::G_CONCAT_VECTORS: return Helper.tryCombineConcatVectors(MI); case TargetOpcode::G_SHUFFLE_VECTOR: @@ -275,7 +316,11 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, // heuristics decide. unsigned MaxLen = EnableOpt ? 0 : 32; // Try to inline memcpy type calls if optimizations are enabled. - return !EnableMinSize ? Helper.tryCombineMemCpyFamily(MI, MaxLen) : false; + if (!EnableMinSize && Helper.tryCombineMemCpyFamily(MI, MaxLen)) + return true; + if (Opc == TargetOpcode::G_MEMSET) + return tryEmitBZero(MI, B, EnableMinSize); + return false; } } diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 178c83b98599..2d3aa10b8c1e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -1019,6 +1019,30 @@ multiclass sve_int_perm_dup_i<string asm> { (!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>; def : InstAlias<"mov $Zd, $Qn", (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>; + + // Duplicate extracted element of vector into all vector elements + def : Pat<(nxv16i8 (AArch64dup (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), + (!cast<Instruction>(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>; + def : Pat<(nxv8i16 (AArch64dup (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(nxv4i32 (AArch64dup (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(nxv2i64 (AArch64dup (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv8f16 (AArch64dup (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(nxv8bf16 (AArch64dup (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(nxv4f16 (AArch64dup (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(nxv2f16 (AArch64dup (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv4f32 (AArch64dup (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(nxv2f32 (AArch64dup (f32 
(vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv2f64 (AArch64dup (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; } class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm, ZPRRegOp zprty, @@ -2828,10 +2852,8 @@ multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm, let Inst{19-16} = Zm; } - def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))), - (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>; - def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))), - (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>; + def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4685,20 +4707,22 @@ multiclass sve_fp_3op_p_pd<bits<3> opc, string asm, SDPatternOperator op> { } multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op, - SDPatternOperator op_nopred> + CondCode cc1, CondCode cc2, + CondCode invcc1, CondCode invcc2> : sve_fp_3op_p_pd<opc, asm, op> { - def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16, - !cast<Instruction>(NAME # _H), PTRUE_H>; - def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16, - !cast<Instruction>(NAME # _H), PTRUE_S>; - def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16, - !cast<Instruction>(NAME # _H), PTRUE_D>; - def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32, - !cast<Instruction>(NAME # _S), PTRUE_S>; - def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32, - !cast<Instruction>(NAME # _S), PTRUE_D>; - def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64, - !cast<Instruction>(NAME # _D), PTRUE_D>; + defm : SVE_SETCC_Pat<cc1, invcc1, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc1, invcc1, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc1, invcc1, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; + + defm : SVE_SETCC_Pat<cc2, invcc2, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc2, invcc2, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; + defm : SVE_SETCC_Pat<cc2, invcc2, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; + defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 6b8cb786bb6c..e90740030460 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -78,6 +78,7 @@ 
private: static bool optimizeConvertFromSVBool(IntrinsicInst *I); static bool optimizePTest(IntrinsicInst *I); static bool optimizeVectorMul(IntrinsicInst *I); + static bool optimizeTBL(IntrinsicInst *I); static bool processPhiNode(IntrinsicInst *I); }; @@ -437,6 +438,41 @@ bool SVEIntrinsicOpts::optimizeVectorMul(IntrinsicInst *I) { return Changed; } +bool SVEIntrinsicOpts::optimizeTBL(IntrinsicInst *I) { + assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_tbl && + "Unexpected opcode"); + + auto *OpVal = I->getOperand(0); + auto *OpIndices = I->getOperand(1); + VectorType *VTy = cast<VectorType>(I->getType()); + + // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with + // constant splat value < minimal element count of result. + auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices); + if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) + return false; + + auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0)); + if (!SplatValue || + SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) + return false; + + // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to + // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. + LLVMContext &Ctx = I->getContext(); + IRBuilder<> Builder(Ctx); + Builder.SetInsertPoint(I); + auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); + auto *VectorSplat = + Builder.CreateVectorSplat(VTy->getElementCount(), Extract); + + I->replaceAllUsesWith(VectorSplat); + I->eraseFromParent(); + if (DupXIntrI->use_empty()) + DupXIntrI->eraseFromParent(); + return true; +} + bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool && "Unexpected opcode"); @@ -507,6 +543,8 @@ bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) { case Intrinsic::aarch64_sve_ptest_first: case Intrinsic::aarch64_sve_ptest_last: return optimizePTest(IntrI); + case Intrinsic::aarch64_sve_tbl: + return optimizeTBL(IntrI); default: return false; } @@ -560,6 +598,7 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { case Intrinsic::aarch64_sve_ptrue: case Intrinsic::aarch64_sve_mul: case Intrinsic::aarch64_sve_fmul: + case Intrinsic::aarch64_sve_tbl: for (User *U : F.users()) Functions.insert(cast<Instruction>(U)->getFunction()); break; |
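The combines above are easiest to see from the source side. The sketch below is illustrative only and is not part of the patch; the function names, the 1024-byte buffer size, and the build flags are assumptions. shifted_compare shows a single-use shift feeding the LHS of a compare with no legal immediate on the RHS, which swap_icmp_operands can move to the RHS so the selector folds it into the cmp. clear_buffer shows a zeroing memset larger than 256 bytes that tryEmitBZero may turn into a G_BZERO (a bzero call) on targets that provide one, such as Darwin. dup_lane2 shows an svtbl of a constant splat index that SVEIntrinsicOpts::optimizeTBL rewrites into an extract-and-splat, which the new sve_int_perm_dup_i patterns can then select as a single indexed DUP.

// Illustrative sketch only: hypothetical functions showing the source-level
// patterns the changes above target. Assumes an AArch64 target with SVE and,
// for the first two cases, GlobalISel (e.g. clang -O2 -fglobal-isel on Darwin).
#include <cstring>
#include <arm_sve.h>

// swap_icmp_operands: the shift has a single use and the compare's RHS is not
// a legal arithmetic immediate, so swapping the operands (and the predicate)
// lets the shift fold into the compare, e.g. "cmp w1, w0, lsl #1" instead of
// a separate lsl followed by cmp.
bool shifted_compare(int a, int b) {
  return (a << 1) < b;
}

// tryEmitBZero: a memset of zero larger than 256 bytes (or any zeroing memset
// at minsize) may be emitted as a bzero call where the target provides one.
void clear_buffer(char *buf) {
  std::memset(buf, 0, 1024);
}

// optimizeTBL: a TBL whose index vector is a constant splat smaller than the
// minimum element count is rewritten as extractelement + splat, i.e. a
// duplicate of one lane, which the added sve_int_perm_dup_i patterns can
// select as a single DUP Zd.S, Zn.S[2].
svuint32_t dup_lane2(svuint32_t v) {
  return svtbl_u32(v, svdup_n_u32(2));
}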