aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlbion Fung <albion.fung@ibm.com>2021-03-09 16:07:31 -0500
committerAlbion Fung <conanap@lep82435v.canlab.ibm.com>2021-03-09 16:08:17 -0500
commit9b6ac9e999e730ee3a9f0bf4850f7794274de3f0 (patch)
tree71af007db61e2dfeaf1bb3a3230a1771712737b5 /llvm/lib/Target/PowerPC
parentRevert "[InstCombine] Add simplification of two logical and/ors" (diff)
downloadllvm-project-9b6ac9e999e730ee3a9f0bf4850f7794274de3f0.tar.gz
llvm-project-9b6ac9e999e730ee3a9f0bf4850f7794274de3f0.tar.bz2
llvm-project-9b6ac9e999e730ee3a9f0bf4850f7794274de3f0.zip
[P10] [Power PC] Exploiting new load rightmost vector element instructions.
This pull request implements patterns to exploit the load rightmost vector element instructions for loading element 0 on little endian PowerPC subtargets into v8i16 and v16i8 vector registers for i16 and i8 data types. Differential Revision: https://reviews.llvm.org/D94816#inline-921403
Diffstat (limited to 'llvm/lib/Target/PowerPC')
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrPrefix.td5
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrVSX.td41
2 files changed, 40 insertions, 6 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 0c6749c8d235..14af94f5c814 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2563,6 +2563,11 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
(STXVRDX $src, xoaddr:$dst)>;
def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst),
(STXVRDX $src, xoaddr:$dst)>;
+ // Load element 0 of a VSX register from memory
+ def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 xoaddr:$src)))),
+ (v8i16 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VSRC))>;
+ def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 xoaddr:$src)))),
+ (v16i8 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VSRC))>;
}
// FIXME: The swap is overkill when the shift amount is a constant.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index e8babce4fb20..475098efedd7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -152,6 +152,7 @@ def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">;
def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">;
def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
+def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
//--------------------- VSX-specific instruction formats ---------------------//
// By default, all VSX instructions are to be selected over their Altivec
@@ -2437,6 +2438,8 @@ def MrgWords {
// [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian]
// [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian]
// [HasVSX, HasP9Vector]
+// [HasVSX, HasP9Vector, NoP10Vector]
+// [HasVSX, HasP9Vector, IsBigEndian]
// [HasVSX, HasP9Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasP9Vector, IsLittleEndian]
// [HasVSX, HasP9Altivec]
@@ -3735,9 +3738,6 @@ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
// Build vectors from i8 loads
-defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
- (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
- (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8,
(VSPLTHs 3, (LXSIBZX xoaddr:$src)),
(VSPLTHs 3, (LXSIBZX xoaddr:$src))>;
@@ -3755,9 +3755,6 @@ defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64,
(XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>;
// Build vectors from i16 loads
-defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
- (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
- (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16,
(XXSPLTWs (LXSIHZX xoaddr:$src), 1),
(XXSPLTWs (LXSIHZX xoaddr:$src), 1)>;
@@ -3955,6 +3952,38 @@ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
(v4i32 (LXVWSX xoaddr:$A))>;
} // HasVSX, HasP9Vector
+// Power9 VSX subtargets without Power10 vector support (NoP10Vector).
+// The same two patterns are repeated in the [HasVSX, HasP9Vector, IsBigEndian]
+// block because the predicate lists differ: this block excludes Power10
+// vector subtargets, the other one is restricted to big endian.
+let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in {
+// Build vectors from i8 loads. Little endian Power10 subtargets use a shorter
+// pattern that needs a COPY_TO_REGCLASS; the copy makes it look like two
+// instructions although only one is produced in practice. The NoP10Vector
+// predicate keeps the patterns below off Power10 VSX subtargets.
+defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+// Build vectors from i16 loads.
+defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+} // HasVSX, HasP9Vector, NoP10Vector
+
+// Any big endian VSX subtarget with Power9 vector support.
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
+// Power10 VSX subtargets have a shorter pattern, but only for little endian;
+// this is still the best pattern for Power9 and Power10 VSX big endian.
+// Build vectors from i8 loads.
+defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+// Build vectors from i16 loads.
+defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+} // HasVSX, HasP9Vector, IsBigEndian
+
// Big endian 64Bit Power9 subtarget.
let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),