Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ValueTracking] Improve Bitcast handling to match SDAG #125935

Open
wants to merge 4 commits into
base: main
Choose a base branch
from

Conversation

abhishek-kaushik22
Copy link
Contributor

Closes #125228

@abhishek-kaushik22
Copy link
Contributor Author

@RKSimon Can you please help me with some more tests? I only have this one, which I got from the C code you provided in the issue (compiled with -O0).

; -O0 IR reproducer for PR #125228: shifts %v left by the low and by the high
; 64-bit lane of %s (each lane first masked to 63), then blends the two
; results. The many allocas/loads/stores are unoptimized intrinsic-wrapper
; spills produced by clang at -O0.
define dso_local noundef <2 x i64> @shift64(<2 x i64> noundef %v, <2 x i64> noundef %s) {
entry:
  ; Stack slots for the inlined intrinsic wrappers (one per __a/__count/__q arg).
  %__q1.addr.i = alloca i64, align 8
  %__q0.addr.i = alloca i64, align 8
  %.compoundliteral.i = alloca <2 x i64>, align 16
  %__a.addr.i13 = alloca <2 x i64>, align 16
  %__a.addr.i12 = alloca <2 x i64>, align 16
  %__a.addr.i11 = alloca <2 x double>, align 16
  %__a.addr.i9 = alloca <2 x i64>, align 16
  %__count.addr.i10 = alloca <2 x i64>, align 16
  %__a.addr.i8 = alloca <2 x i64>, align 16
  %__count.addr.i = alloca <2 x i64>, align 16
  %__q.addr.i = alloca i64, align 8
  %__a.addr.i = alloca <2 x i64>, align 16
  %__b.addr.i = alloca <2 x i64>, align 16
  %v.addr = alloca <2 x i64>, align 16
  %s.addr = alloca <2 x i64>, align 16
  %x_ = alloca <2 x i64>, align 16
  %_y = alloca <2 x i64>, align 16
  %xy = alloca <2 x i64>, align 16
  store <2 x i64> %v, ptr %v.addr, align 16
  store <2 x i64> %s, ptr %s.addr, align 16
  %0 = load <2 x i64>, ptr %s.addr, align 16
  ; Build the splat constant <63, 63> (set1_epi64x-style round trip through
  ; scalar slots and a compound literal).
  store i64 63, ptr %__q.addr.i, align 8
  %1 = load i64, ptr %__q.addr.i, align 8
  %2 = load i64, ptr %__q.addr.i, align 8
  store i64 %1, ptr %__q1.addr.i, align 8
  store i64 %2, ptr %__q0.addr.i, align 8
  %3 = load i64, ptr %__q0.addr.i, align 8
  %vecinit.i = insertelement <2 x i64> poison, i64 %3, i32 0
  %4 = load i64, ptr %__q1.addr.i, align 8
  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %4, i32 1
  store <2 x i64> %vecinit1.i, ptr %.compoundliteral.i, align 16
  %5 = load <2 x i64>, ptr %.compoundliteral.i, align 16
  ; s.addr = s & <63, 63> — clamp both shift amounts into [0, 63]. This is the
  ; known-bits fact the patch wants ValueTracking to see through the bitcasts.
  store <2 x i64> %0, ptr %__a.addr.i, align 16
  store <2 x i64> %5, ptr %__b.addr.i, align 16
  %6 = load <2 x i64>, ptr %__a.addr.i, align 16
  %7 = load <2 x i64>, ptr %__b.addr.i, align 16
  %and.i = and <2 x i64> %6, %7
  store <2 x i64> %and.i, ptr %s.addr, align 16
  ; x_ = psll.q(v, mask): the intrinsic shifts both lanes of v by the low
  ; 64 bits of its count operand.
  %8 = load <2 x i64>, ptr %v.addr, align 16
  %9 = load <2 x i64>, ptr %s.addr, align 16
  store <2 x i64> %8, ptr %__a.addr.i9, align 16
  store <2 x i64> %9, ptr %__count.addr.i10, align 16
  %10 = load <2 x i64>, ptr %__a.addr.i9, align 16
  %11 = load <2 x i64>, ptr %__count.addr.i10, align 16
  %12 = call noundef <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %10, <2 x i64> %11)
  store <2 x i64> %12, ptr %x_, align 16
  ; psrldq by 8 bytes (bitcast -> byte shuffle with zero upper -> bitcast):
  ; moves the high i64 lane of the mask into the low lane, zeroing the rest.
  %13 = load <2 x i64>, ptr %v.addr, align 16
  %14 = load <2 x i64>, ptr %s.addr, align 16
  %cast = bitcast <2 x i64> %14 to <16 x i8>
  %psrldq = shufflevector <16 x i8> %cast, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
  ; _y = psll.q(v, high-lane-of-mask).
  store <2 x i64> %13, ptr %__a.addr.i8, align 16
  store <2 x i64> %cast3, ptr %__count.addr.i, align 16
  %15 = load <2 x i64>, ptr %__a.addr.i8, align 16
  %16 = load <2 x i64>, ptr %__count.addr.i, align 16
  %17 = call noundef <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %15, <2 x i64> %16)
  store <2 x i64> %17, ptr %_y, align 16
  ; shufpd-style blend: take element 0 of x_ and element 1 of _y
  ; (shuffle mask <0, 3> over the concatenated <2 x double> pair).
  %18 = load <2 x i64>, ptr %x_, align 16
  store <2 x i64> %18, ptr %__a.addr.i13, align 16
  %19 = load <2 x i64>, ptr %__a.addr.i13, align 16
  %20 = bitcast <2 x i64> %19 to <2 x double>
  %21 = load <2 x i64>, ptr %_y, align 16
  store <2 x i64> %21, ptr %__a.addr.i12, align 16
  %22 = load <2 x i64>, ptr %__a.addr.i12, align 16
  %23 = bitcast <2 x i64> %22 to <2 x double>
  %shufp = shufflevector <2 x double> %20, <2 x double> %23, <2 x i32> <i32 0, i32 3>
  store <2 x double> %shufp, ptr %__a.addr.i11, align 16
  %24 = load <2 x double>, ptr %__a.addr.i11, align 16
  %25 = bitcast <2 x double> %24 to <2 x i64>
  store <2 x i64> %25, ptr %xy, align 16
  %26 = load <2 x i64>, ptr %xy, align 16
  ret <2 x i64> %26
}

declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1

@RKSimon
Copy link
Collaborator

RKSimon commented Feb 5, 2025

https://zig.godbolt.org/z/PWqzTGd31 - should be able to replace both psll intrinsics (currently only one)

@llvmbot
Copy link
Member

llvmbot commented Feb 6, 2025

@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-llvm-transforms

Author: Abhishek Kaushik (abhishek-kaushik22)

Changes

Closes #125228


Full diff: https://github.com/llvm/llvm-project/pull/125935.diff

2 Files Affected:

  • (modified) llvm/lib/Analysis/ValueTracking.cpp (+24-2)
  • (modified) llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll (+28)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6eba6c0f08c3f40..1a2ed6d4a75f871 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1301,6 +1301,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
+    unsigned NumElts = DemandedElts.getBitWidth();
+    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1319,7 +1321,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
-      unsigned NumElts = DemandedElts.getBitWidth();
       unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
@@ -1331,10 +1332,31 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc,
                          Depth + 1, Q);
-        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
+
+    if (SubBitWidth % BitWidth == 0) {
+      unsigned SubScale = SubBitWidth / BitWidth;
+      KnownBits KnownSrc(SubBitWidth);
+      APInt SubDemandedElts =
+          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Depth + 1,
+                       Q);
+
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      for (unsigned i = 0; i != SubScale; ++i) {
+        if (DemandedElts[i]) {
+          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+          unsigned Offset = (Shifts % SubScale) * BitWidth;
+          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+          if (Known.isUnknown())
+            break;
+        }
+      }
+    }
     break;
   }
   case Instruction::SExt: {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index d38e22a4e3851be..9fb4bbc557f3cb3 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,6 +3732,34 @@ define <4 x i64> @test_avx2_psrl_0() {
   ret <4 x i64> %16
 }
 
+define <2 x i64> @pr125228(<2 x i64> %v, <2 x i64> %s) {
+; CHECK-LABEL: @pr125228(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i64> [[MASK]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL0:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
+; CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT:    [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i64> [[SHUFP_UNCASTED]]
+;
+entry:
+  %mask = and <2 x i64> %s, splat (i64 63)
+  %sll0 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %mask)
+  %cast = bitcast <2 x i64> %mask to <16 x i8>
+  %psrldq = shufflevector <16 x i8> %cast, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
+  %sll1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %cast3)
+  %cast0 = bitcast <2 x i64> %sll0 to <2 x double>
+  %cast1 = bitcast <2 x i64> %sll1 to <2 x double>
+  %shufp = shufflevector <2 x double> %cast0, <2 x double> %cast1, <2 x i32> <i32 0, i32 3>
+  %res = bitcast <2 x double> %shufp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) #1
 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) #1
 declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) #1

RKSimon added a commit that referenced this pull request Feb 14, 2025
@RKSimon
Copy link
Collaborator

RKSimon commented Feb 14, 2025

Please can you rebase - I've added the test to trunk so this patch can show the diff more easily.

The next step is to add more test coverage - if you comment out the DAG equivalent code and look at the resulting test changes, you might be able to adapt some of those tests to create new IR tests.

joaosaffran pushed a commit to joaosaffran/llvm-project that referenced this pull request Feb 14, 2025
sivan-shani pushed a commit to sivan-shani/llvm-project that referenced this pull request Feb 24, 2025
@RKSimon
Copy link
Collaborator

RKSimon commented Mar 13, 2025

@abhishek-kaushik22 any luck with generating extra tests?

@abhishek-kaushik22
Copy link
Contributor Author

@abhishek-kaushik22 any luck with generating extra tests?

I've been looking at SDag tests that fail after commenting out the bitcast code but I've only been able to generate a single test (from ~950 failures). I'll try to generate more this week.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[ValueTracking] computeKnownBitsFromOperator - improve Instruction::BitCast handling to match SelectionDAG
3 participants