[ValueTracking] Improve Bitcast handling to match SDAG #125935
base: main
Conversation
@RKSimon Can you please help me with some more tests? I only have this one, which I got from the C code you provided in the issue (compiled with -O0).

define dso_local noundef <2 x i64> @shift64(<2 x i64> noundef %v, <2 x i64> noundef %s) {
entry:
%__q1.addr.i = alloca i64, align 8
%__q0.addr.i = alloca i64, align 8
%.compoundliteral.i = alloca <2 x i64>, align 16
%__a.addr.i13 = alloca <2 x i64>, align 16
%__a.addr.i12 = alloca <2 x i64>, align 16
%__a.addr.i11 = alloca <2 x double>, align 16
%__a.addr.i9 = alloca <2 x i64>, align 16
%__count.addr.i10 = alloca <2 x i64>, align 16
%__a.addr.i8 = alloca <2 x i64>, align 16
%__count.addr.i = alloca <2 x i64>, align 16
%__q.addr.i = alloca i64, align 8
%__a.addr.i = alloca <2 x i64>, align 16
%__b.addr.i = alloca <2 x i64>, align 16
%v.addr = alloca <2 x i64>, align 16
%s.addr = alloca <2 x i64>, align 16
%x_ = alloca <2 x i64>, align 16
%_y = alloca <2 x i64>, align 16
%xy = alloca <2 x i64>, align 16
store <2 x i64> %v, ptr %v.addr, align 16
store <2 x i64> %s, ptr %s.addr, align 16
%0 = load <2 x i64>, ptr %s.addr, align 16
store i64 63, ptr %__q.addr.i, align 8
%1 = load i64, ptr %__q.addr.i, align 8
%2 = load i64, ptr %__q.addr.i, align 8
store i64 %1, ptr %__q1.addr.i, align 8
store i64 %2, ptr %__q0.addr.i, align 8
%3 = load i64, ptr %__q0.addr.i, align 8
%vecinit.i = insertelement <2 x i64> poison, i64 %3, i32 0
%4 = load i64, ptr %__q1.addr.i, align 8
%vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %4, i32 1
store <2 x i64> %vecinit1.i, ptr %.compoundliteral.i, align 16
%5 = load <2 x i64>, ptr %.compoundliteral.i, align 16
store <2 x i64> %0, ptr %__a.addr.i, align 16
store <2 x i64> %5, ptr %__b.addr.i, align 16
%6 = load <2 x i64>, ptr %__a.addr.i, align 16
%7 = load <2 x i64>, ptr %__b.addr.i, align 16
%and.i = and <2 x i64> %6, %7
store <2 x i64> %and.i, ptr %s.addr, align 16
%8 = load <2 x i64>, ptr %v.addr, align 16
%9 = load <2 x i64>, ptr %s.addr, align 16
store <2 x i64> %8, ptr %__a.addr.i9, align 16
store <2 x i64> %9, ptr %__count.addr.i10, align 16
%10 = load <2 x i64>, ptr %__a.addr.i9, align 16
%11 = load <2 x i64>, ptr %__count.addr.i10, align 16
%12 = call noundef <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %10, <2 x i64> %11)
store <2 x i64> %12, ptr %x_, align 16
%13 = load <2 x i64>, ptr %v.addr, align 16
%14 = load <2 x i64>, ptr %s.addr, align 16
%cast = bitcast <2 x i64> %14 to <16 x i8>
%psrldq = shufflevector <16 x i8> %cast, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
%cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
store <2 x i64> %13, ptr %__a.addr.i8, align 16
store <2 x i64> %cast3, ptr %__count.addr.i, align 16
%15 = load <2 x i64>, ptr %__a.addr.i8, align 16
%16 = load <2 x i64>, ptr %__count.addr.i, align 16
%17 = call noundef <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %15, <2 x i64> %16)
store <2 x i64> %17, ptr %_y, align 16
%18 = load <2 x i64>, ptr %x_, align 16
store <2 x i64> %18, ptr %__a.addr.i13, align 16
%19 = load <2 x i64>, ptr %__a.addr.i13, align 16
%20 = bitcast <2 x i64> %19 to <2 x double>
%21 = load <2 x i64>, ptr %_y, align 16
store <2 x i64> %21, ptr %__a.addr.i12, align 16
%22 = load <2 x i64>, ptr %__a.addr.i12, align 16
%23 = bitcast <2 x i64> %22 to <2 x double>
%shufp = shufflevector <2 x double> %20, <2 x double> %23, <2 x i32> <i32 0, i32 3>
store <2 x double> %shufp, ptr %__a.addr.i11, align 16
%24 = load <2 x double>, ptr %__a.addr.i11, align 16
%25 = bitcast <2 x double> %24 to <2 x i64>
store <2 x i64> %25, ptr %xy, align 16
%26 = load <2 x i64>, ptr %xy, align 16
ret <2 x i64> %26
}
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
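For context, here is a hypothetical reconstruction of the C source this IR appears to come from (names and exact form are inferred from the -O0 output above; the actual code in issue #125228 may differ):

// Hypothetical reconstruction, inferred from the -O0 IR above; the exact
// source from issue #125228 may differ.
#include <immintrin.h>

__m128i shift64(__m128i v, __m128i s) {
  s = _mm_and_si128(s, _mm_set1_epi64x(63));           // clamp both shift counts to [0, 63]
  __m128i x_ = _mm_sll_epi64(v, s);                    // shift all lanes by the low count
  __m128i _y = _mm_sll_epi64(v, _mm_srli_si128(s, 8)); // shift all lanes by the high count
  // shufpd with imm 2 takes element 0 of x_ and element 1 of _y, matching
  // the shufflevector <2 x i32> <i32 0, i32 3> in the IR.
  return _mm_castpd_si128(
      _mm_shuffle_pd(_mm_castsi128_pd(x_), _mm_castsi128_pd(_y), 2));
}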
https://zig.godbolt.org/z/PWqzTGd31 - we should be able to replace both psll intrinsics (currently only one is replaced).
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms

Author: Abhishek Kaushik (abhishek-kaushik22)

Changes: Closes #125228

Full diff: https://github.com/llvm/llvm-project/pull/125935.diff

2 Files Affected:
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6eba6c0f08c3f40..1a2ed6d4a75f871 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1301,6 +1301,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
isa<ScalableVectorType>(I->getType()))
break;
+ unsigned NumElts = DemandedElts.getBitWidth();
+ bool IsLE = Q.DL.isLittleEndian();
// Look through a cast from narrow vector elements to wider type.
// Examples: v4i32 -> v2i64, v3i8 -> v24
unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1319,7 +1321,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
//
// The known bits of each sub-element are then inserted into place
// (dependent on endian) to form the full result of known bits.
- unsigned NumElts = DemandedElts.getBitWidth();
unsigned SubScale = BitWidth / SubBitWidth;
APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
for (unsigned i = 0; i != NumElts; ++i) {
@@ -1331,10 +1332,31 @@ static void computeKnownBitsFromOperator(const Operator *I,
for (unsigned i = 0; i != SubScale; ++i) {
computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc,
Depth + 1, Q);
- unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+ unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
}
}
+
+ if (SubBitWidth % BitWidth == 0) {
+ unsigned SubScale = SubBitWidth / BitWidth;
+ KnownBits KnownSrc(SubBitWidth);
+ APInt SubDemandedElts =
+ APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+ computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Depth + 1,
+ Q);
+
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ for (unsigned i = 0; i != SubScale; ++i) {
+ if (DemandedElts[i]) {
+ unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+ unsigned Offset = (Shifts % SubScale) * BitWidth;
+ Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+ if (Known.isUnknown())
+ break;
+ }
+ }
+ }
break;
}
case Instruction::SExt: {
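To summarize the change: the existing path only looked through casts from narrow source elements to wider destination elements (e.g. v4i32 -> v2i64); the new block added above handles the reverse, wide-to-narrow case (e.g. v2i64 -> v4i32), matching what SelectionDAG already does. Each demanded narrow element takes its known bits from the corresponding bit range of its wide source element. A minimal sketch of that per-element mapping, assuming LLVM's KnownBits API (the helper name is illustrative, not part of the patch):

#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Sketch: known bits of narrow element EltIdx of a wide-to-narrow vector
// bitcast, little-endian. KnownSrc holds the known bits of the wide source
// element (SubBitWidth bits); the result is BitWidth bits wide.
static KnownBits narrowEltKnownBits(const KnownBits &KnownSrc,
                                    unsigned BitWidth, unsigned EltIdx) {
  unsigned SubScale = KnownSrc.getBitWidth() / BitWidth;
  // Narrow element EltIdx lives at bit offset (EltIdx % SubScale) * BitWidth
  // inside its wide source element.
  unsigned Offset = (EltIdx % SubScale) * BitWidth;
  return KnownSrc.extractBits(BitWidth, Offset);
}

Because computeKnownBits returns a single summary for all demanded lanes, the patch intersects these per-element results (Known.intersectWith) and bails out early once nothing is known.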
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index d38e22a4e3851be..9fb4bbc557f3cb3 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,6 +3732,34 @@ define <4 x i64> @test_avx2_psrl_0() {
ret <4 x i64> %16
}
+define <2 x i64> @pr125228(<2 x i64> %v, <2 x i64> %s) {
+; CHECK-LABEL: @pr125228(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[MASK]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[SLL0:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP0]]
+; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
+; CHECK-NEXT: [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT: [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x i64> [[SHUFP_UNCASTED]]
+;
+entry:
+ %mask = and <2 x i64> %s, splat (i64 63)
+ %sll0 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %mask)
+ %cast = bitcast <2 x i64> %mask to <16 x i8>
+ %psrldq = shufflevector <16 x i8> %cast, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
+ %sll1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %cast3)
+ %cast0 = bitcast <2 x i64> %sll0 to <2 x double>
+ %cast1 = bitcast <2 x i64> %sll1 to <2 x double>
+ %shufp = shufflevector <2 x double> %cast0, <2 x double> %cast1, <2 x i32> <i32 0, i32 3>
+ %res = bitcast <2 x double> %shufp to <2 x i64>
+ ret <2 x i64> %res
+}
+
declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) #1
declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) #1
declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) #1
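Note how the CHECK lines capture the point of the patch: both @llvm.x86.sse2.psll.q calls are now simplified to generic shl instructions ([[SLL0]] and [[SLL1]]), whereas previously only one of the two intrinsics was replaced.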
Pulled out of #125935 to show diff
Please can you rebase - I've added the test to trunk so this patch can show the diff more easily. The next step is to add more test coverage - if you comment out the equivalent DAG code and look at the test changes, you might be able to adapt some of those tests to create new IR tests.
Force-pushed from 941e0dd to 345c8fa: Pulled out of llvm#125935 to show diff
@abhishek-kaushik22 any luck with generating extra tests?
I've been looking at SDAG tests that fail after commenting out the bitcast code, but I've only been able to generate a single test (from ~950 failures). I'll try to generate more this week.
Closes #125228