diff options
author | Albion Fung <albion.fung@ibm.com> | 2021-03-09 16:07:31 -0500 |
---|---|---|
committer | Albion Fung <conanap@lep82435v.canlab.ibm.com> | 2021-03-09 16:08:17 -0500 |
commit | 9b6ac9e999e730ee3a9f0bf4850f7794274de3f0 (patch) | |
tree | 71af007db61e2dfeaf1bb3a3230a1771712737b5 /llvm/lib/Target/PowerPC | |
parent | Revert "[InstCombine] Add simplification of two logical and/ors" (diff) | |
download | llvm-project-9b6ac9e999e730ee3a9f0bf4850f7794274de3f0.tar.gz llvm-project-9b6ac9e999e730ee3a9f0bf4850f7794274de3f0.tar.bz2 llvm-project-9b6ac9e999e730ee3a9f0bf4850f7794274de3f0.zip |
[P10] [PowerPC] Exploiting new load rightmost vector element instructions.
This patch implements patterns that exploit the load rightmost vector
element instructions to load i16 and i8 values into element 0 of v8i16 and
v16i8 vector registers on little endian PowerPC subtargets.
Differential Revision: https://reviews.llvm.org/D94816
Diffstat (limited to 'llvm/lib/Target/PowerPC')
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 5 | ||||
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCInstrVSX.td | 41 |
2 files changed, 40 insertions, 6 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 0c6749c8d235..14af94f5c814 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -2563,6 +2563,11 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in { (STXVRDX $src, xoaddr:$dst)>; def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst), (STXVRDX $src, xoaddr:$dst)>; + // Load element 0 of a VSX register from memory + def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 xoaddr:$src)))), + (v8i16 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VSRC))>; + def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 xoaddr:$src)))), + (v16i8 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VSRC))>; } // FIXME: The swap is overkill when the shift amount is a constant. diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index e8babce4fb20..475098efedd7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -152,6 +152,7 @@ def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">; def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">; def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">; def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">; +def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">; //--------------------- VSX-specific instruction formats ---------------------// // By default, all VSX instructions are to be selected over their Altivec @@ -2437,6 +2438,8 @@ def MrgWords { // [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian] // [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian] // [HasVSX, HasP9Vector] +// [HasVSX, HasP9Vector, NoP10Vector] +// [HasVSX, HasP9Vector, IsBigEndian] // [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP9Vector, IsLittleEndian] // [HasVSX, HasP9Altivec] @@ -3735,9 +3738,6 @@ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; // Build 
vectors from i8 loads -defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8, - (VSPLTBs 7, (LXSIBZX xoaddr:$src)), - (VSPLTBs 7, (LXSIBZX xoaddr:$src))>; defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8, (VSPLTHs 3, (LXSIBZX xoaddr:$src)), (VSPLTHs 3, (LXSIBZX xoaddr:$src))>; @@ -3755,9 +3755,6 @@ defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64, (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>; // Build vectors from i16 loads -defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16, - (VSPLTHs 3, (LXSIHZX xoaddr:$src)), - (VSPLTHs 3, (LXSIHZX xoaddr:$src))>; defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16, (XXSPLTWs (LXSIHZX xoaddr:$src), 1), (XXSPLTWs (LXSIHZX xoaddr:$src), 1)>; @@ -3955,6 +3952,38 @@ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)), (v4i32 (LXVWSX xoaddr:$A))>; } // HasVSX, HasP9Vector +// Any Power9 VSX subtarget with equivalent length but better Power10 VSX +// patterns. +// Two identical blocks are required due to the slightly different predicates: +// One without P10 instructions, the other is BigEndian only with P10 instructions. +let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in { +// Little endian Power10 subtargets produce a shorter pattern but require a +// COPY_TO_REGCLASS. The COPY_TO_REGCLASS makes it appear to need two instructions +// to perform the operation, when only one instruction is produced in practice. +// The NoP10Vector predicate excludes these patterns from Power10 VSX subtargets. 
+defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8, + (VSPLTBs 7, (LXSIBZX xoaddr:$src)), + (VSPLTBs 7, (LXSIBZX xoaddr:$src))>; +// Build vectors from i16 loads +defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16, + (VSPLTHs 3, (LXSIHZX xoaddr:$src)), + (VSPLTHs 3, (LXSIHZX xoaddr:$src))>; +} // HasVSX, HasP9Vector, NoP10Vector + +// Any big endian Power9 VSX subtarget +let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in { +// Power10 VSX subtargets produce a shorter pattern for little endian targets +// but this is still the best pattern for Power9 and Power10 VSX big endian. +// Build vectors from i8 loads +defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8, + (VSPLTBs 7, (LXSIBZX xoaddr:$src)), + (VSPLTBs 7, (LXSIBZX xoaddr:$src))>; +// Build vectors from i16 loads +defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16, + (VSPLTHs 3, (LXSIHZX xoaddr:$src)), + (VSPLTHs 3, (LXSIHZX xoaddr:$src))>; +} // HasVSX, HasP9Vector, IsBigEndian + // Big endian 64Bit Power9 subtarget. let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in { def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), |