Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ValueTracking] Improve Bitcast handling to match SDAG #125935

Open
wants to merge 4 commits into
base: main
Choose a base branch
from

Conversation

abhishek-kaushik22
Copy link
Contributor

Closes #125228

@abhishek-kaushik22
Copy link
Contributor Author

@RKSimon Can you please help me with some more tests? I only have this one, which I got from the C code you provided in the issue (compiled with -O0).

; -O0 IR reproducer for PR #125228: shifts %v left by the low and by the high
; 64-bit lane of %s (each lane first masked to 63), then blends the two
; results. The many allocas/loads/stores are unoptimized intrinsic-wrapper
; spills produced by clang at -O0.
define dso_local noundef <2 x i64> @shift64(<2 x i64> noundef %v, <2 x i64> noundef %s) {
entry:
  ; Stack slots for the inlined intrinsic wrappers (one per __a/__count/__q arg).
  %__q1.addr.i = alloca i64, align 8
  %__q0.addr.i = alloca i64, align 8
  %.compoundliteral.i = alloca <2 x i64>, align 16
  %__a.addr.i13 = alloca <2 x i64>, align 16
  %__a.addr.i12 = alloca <2 x i64>, align 16
  %__a.addr.i11 = alloca <2 x double>, align 16
  %__a.addr.i9 = alloca <2 x i64>, align 16
  %__count.addr.i10 = alloca <2 x i64>, align 16
  %__a.addr.i8 = alloca <2 x i64>, align 16
  %__count.addr.i = alloca <2 x i64>, align 16
  %__q.addr.i = alloca i64, align 8
  %__a.addr.i = alloca <2 x i64>, align 16
  %__b.addr.i = alloca <2 x i64>, align 16
  %v.addr = alloca <2 x i64>, align 16
  %s.addr = alloca <2 x i64>, align 16
  %x_ = alloca <2 x i64>, align 16
  %_y = alloca <2 x i64>, align 16
  %xy = alloca <2 x i64>, align 16
  store <2 x i64> %v, ptr %v.addr, align 16
  store <2 x i64> %s, ptr %s.addr, align 16
  %0 = load <2 x i64>, ptr %s.addr, align 16
  ; Build the splat constant <63, 63> (set1_epi64x-style round trip through
  ; scalar slots and a compound literal).
  store i64 63, ptr %__q.addr.i, align 8
  %1 = load i64, ptr %__q.addr.i, align 8
  %2 = load i64, ptr %__q.addr.i, align 8
  store i64 %1, ptr %__q1.addr.i, align 8
  store i64 %2, ptr %__q0.addr.i, align 8
  %3 = load i64, ptr %__q0.addr.i, align 8
  %vecinit.i = insertelement <2 x i64> poison, i64 %3, i32 0
  %4 = load i64, ptr %__q1.addr.i, align 8
  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %4, i32 1
  store <2 x i64> %vecinit1.i, ptr %.compoundliteral.i, align 16
  %5 = load <2 x i64>, ptr %.compoundliteral.i, align 16
  ; s.addr = s & <63, 63> — clamp both shift amounts into [0, 63]. This is the
  ; known-bits fact the patch wants ValueTracking to see through the bitcasts.
  store <2 x i64> %0, ptr %__a.addr.i, align 16
  store <2 x i64> %5, ptr %__b.addr.i, align 16
  %6 = load <2 x i64>, ptr %__a.addr.i, align 16
  %7 = load <2 x i64>, ptr %__b.addr.i, align 16
  %and.i = and <2 x i64> %6, %7
  store <2 x i64> %and.i, ptr %s.addr, align 16
  ; x_ = psll.q(v, mask): the intrinsic shifts both lanes of v by the low
  ; 64 bits of its count operand.
  %8 = load <2 x i64>, ptr %v.addr, align 16
  %9 = load <2 x i64>, ptr %s.addr, align 16
  store <2 x i64> %8, ptr %__a.addr.i9, align 16
  store <2 x i64> %9, ptr %__count.addr.i10, align 16
  %10 = load <2 x i64>, ptr %__a.addr.i9, align 16
  %11 = load <2 x i64>, ptr %__count.addr.i10, align 16
  %12 = call noundef <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %10, <2 x i64> %11)
  store <2 x i64> %12, ptr %x_, align 16
  ; psrldq by 8 bytes (bitcast -> byte shuffle with zero upper -> bitcast):
  ; moves the high i64 lane of the mask into the low lane, zeroing the rest.
  %13 = load <2 x i64>, ptr %v.addr, align 16
  %14 = load <2 x i64>, ptr %s.addr, align 16
  %cast = bitcast <2 x i64> %14 to <16 x i8>
  %psrldq = shufflevector <16 x i8> %cast, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
  ; _y = psll.q(v, high-lane-of-mask).
  store <2 x i64> %13, ptr %__a.addr.i8, align 16
  store <2 x i64> %cast3, ptr %__count.addr.i, align 16
  %15 = load <2 x i64>, ptr %__a.addr.i8, align 16
  %16 = load <2 x i64>, ptr %__count.addr.i, align 16
  %17 = call noundef <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %15, <2 x i64> %16)
  store <2 x i64> %17, ptr %_y, align 16
  ; shufpd-style blend: take element 0 of x_ and element 1 of _y
  ; (shuffle mask <0, 3> over the concatenated <2 x double> pair).
  %18 = load <2 x i64>, ptr %x_, align 16
  store <2 x i64> %18, ptr %__a.addr.i13, align 16
  %19 = load <2 x i64>, ptr %__a.addr.i13, align 16
  %20 = bitcast <2 x i64> %19 to <2 x double>
  %21 = load <2 x i64>, ptr %_y, align 16
  store <2 x i64> %21, ptr %__a.addr.i12, align 16
  %22 = load <2 x i64>, ptr %__a.addr.i12, align 16
  %23 = bitcast <2 x i64> %22 to <2 x double>
  %shufp = shufflevector <2 x double> %20, <2 x double> %23, <2 x i32> <i32 0, i32 3>
  store <2 x double> %shufp, ptr %__a.addr.i11, align 16
  %24 = load <2 x double>, ptr %__a.addr.i11, align 16
  %25 = bitcast <2 x double> %24 to <2 x i64>
  store <2 x i64> %25, ptr %xy, align 16
  %26 = load <2 x i64>, ptr %xy, align 16
  ret <2 x i64> %26
}

declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1

@RKSimon
Copy link
Collaborator

RKSimon commented Feb 5, 2025

https://zig.godbolt.org/z/PWqzTGd31 - should be able to replace both psll intrinsics (currently only one)

@llvmbot
Copy link
Member

llvmbot commented Feb 6, 2025

@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-llvm-transforms

Author: Abhishek Kaushik (abhishek-kaushik22)

Changes

Closes #125228


Full diff: https://github.com/llvm/llvm-project/pull/125935.diff

2 Files Affected:

  • (modified) llvm/lib/Analysis/ValueTracking.cpp (+24-2)
  • (modified) llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll (+28)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6eba6c0f08c3f40..1a2ed6d4a75f871 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1301,6 +1301,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
+    unsigned NumElts = DemandedElts.getBitWidth();
+    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1319,7 +1321,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
-      unsigned NumElts = DemandedElts.getBitWidth();
       unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
@@ -1331,10 +1332,31 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc,
                          Depth + 1, Q);
-        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
+
+    if (SubBitWidth % BitWidth == 0) {
+      unsigned SubScale = SubBitWidth / BitWidth;
+      KnownBits KnownSrc(SubBitWidth);
+      APInt SubDemandedElts =
+          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Depth + 1,
+                       Q);
+
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      for (unsigned i = 0; i != SubScale; ++i) {
+        if (DemandedElts[i]) {
+          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+          unsigned Offset = (Shifts % SubScale) * BitWidth;
+          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+          if (Known.isUnknown())
+            break;
+        }
+      }
+    }
     break;
   }
   case Instruction::SExt: {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index d38e22a4e3851be..9fb4bbc557f3cb3 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,6 +3732,34 @@ define <4 x i64> @test_avx2_psrl_0() {
   ret <4 x i64> %16
 }
 
+define <2 x i64> @pr125228(<2 x i64> %v, <2 x i64> %s) {
+; CHECK-LABEL: @pr125228(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i64> [[MASK]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL0:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
+; CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT:    [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i64> [[SHUFP_UNCASTED]]
+;
+entry:
+  %mask = and <2 x i64> %s, splat (i64 63)
+  %sll0 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %mask)
+  %cast = bitcast <2 x i64> %mask to <16 x i8>
+  %psrldq = shufflevector <16 x i8> %cast, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
+  %sll1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %cast3)
+  %cast0 = bitcast <2 x i64> %sll0 to <2 x double>
+  %cast1 = bitcast <2 x i64> %sll1 to <2 x double>
+  %shufp = shufflevector <2 x double> %cast0, <2 x double> %cast1, <2 x i32> <i32 0, i32 3>
+  %res = bitcast <2 x double> %shufp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) #1
 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) #1
 declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) #1

RKSimon added a commit that referenced this pull request Feb 14, 2025
@RKSimon
Copy link
Collaborator

RKSimon commented Feb 14, 2025

Please can you rebase - I've added the test to trunk so this patch can show the diff more easily.

The next step is to add more test coverage - if you comment out the DAG equivalent code and look at the resulting test changes, you might be able to adapt some of those tests to create new IR tests.

joaosaffran pushed a commit to joaosaffran/llvm-project that referenced this pull request Feb 14, 2025
sivan-shani pushed a commit to sivan-shani/llvm-project that referenced this pull request Feb 24, 2025
@RKSimon
Copy link
Collaborator

RKSimon commented Mar 13, 2025

@abhishek-kaushik22 any luck with generating extra tests?

@abhishek-kaushik22
Copy link
Contributor Author

@abhishek-kaushik22 any luck with generating extra tests?

I've been looking at SDag tests that fail after commenting out the bitcast code but I've only been able to generate a single test (from ~950 failures). I'll try to generate more this week.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[ValueTracking] computeKnownBitsFromOperator - improve Instruction::BitCast handling to match SelectionDAG
3 participants