Coalescing of uniform values

ekochetk · igcbot · commit 693cfe5e5e1a · 2025-06-02T12:44:45.000+02:00
IGCVectorizer can vectorize uniform values, which then live bundled together in a vector.
Vector emission now supports coalescing them: we emit a single wide instruction
to cover multiple uniform values that were previously processed one by one.
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -4677,7 +4677,6 @@ void EmitPass::FPTrunc(const SSource sources[2], const DstModifier& modifier) {
     }
 }
 
-
 void EmitPass::Add(const SSource sources[2], const DstModifier& modifier)
 {
     CVariable* src[2];
@@ -4693,6 +4692,21 @@ void EmitPass::Add(const SSource sources[2], const DstModifier& modifier)
         IGC_ASSERT_EXIT_MESSAGE(numLanes(m_encoder->GetSimdSize()) == 16, "As of now Vector Emission is only supported for SIMD16");
         unsigned VectorSize = getVectorSize(sources[0].value);
 
+        bool AllUniform = src[0]->IsUniform() && src[1]->IsUniform() && m_destination->IsUniform();
+        // We cannot emit a 16-wide instruction if the SIMD size is set to 8, but we can emit a 4-wide one.
+        // Simple ALU instructions have the same possible width as the SIMD size; "math" pipeline
+        // instructions have a reduced width.
+        bool CanEmitThisSize = VectorSize <= numLanes(m_currShader->m_SIMDSize);
+
+        if (IGC_IS_FLAG_ENABLED(VectorizerUniformValueVectorizationEnabled) && AllUniform && CanEmitThisSize) {
+            m_encoder->SetSrcRegion(0, 1, 1, 0);
+            m_encoder->SetSrcRegion(1, 1, 1, 0);
+            m_encoder->SetUniformSIMDSize(lanesToSIMDMode(VectorSize));
+            m_encoder->Add(m_destination, src[0], src[1]);
+            m_encoder->Push();
+            return;
+        }
+
         for (unsigned i = 0; i < VectorSize; ++i) {
             SetSourceModifiers(0, sources[0]);
             SetSourceModifiers(1, sources[1]);
@@ -4726,13 +4740,30 @@ void EmitPass::Mul(const SSource sources[2], const DstModifier& modifier)
         src[i] = GetSrcVariable(sources[i]);
     }
 
+    unsigned SIMDSize = numLanes(m_currShader->m_SIMDSize);
+
     if (IGC_IS_FLAG_ENABLED(EnableVectorEmitter) &&
             sources[0].value->getType()->isVectorTy() &&
             sources[1].value->getType()->isVectorTy()) {
 
         IGC_ASSERT_EXIT_MESSAGE(numLanes(m_encoder->GetSimdSize()) == 16, "As of now Vector Emission is only supported for SIMD16");
         unsigned VectorSize = getVectorSize(sources[0].value);
 
+        bool AllUniform = src[0]->IsUniform() && src[1]->IsUniform() && m_destination->IsUniform();
+        // We cannot emit a 16-wide instruction if the SIMD size is set to 8, but we can emit a 4-wide one.
+        // Simple ALU instructions have the same possible width as the SIMD size; "math" pipeline
+        // instructions have a reduced width.
+        bool CanEmitThisSize = VectorSize <= SIMDSize;
+
+        if (IGC_IS_FLAG_ENABLED(VectorizerUniformValueVectorizationEnabled) && AllUniform && CanEmitThisSize) {
+            m_encoder->SetSrcRegion(0, 1, 1, 0);
+            m_encoder->SetSrcRegion(1, 1, 1, 0);
+            m_encoder->SetUniformSIMDSize(lanesToSIMDMode(VectorSize));
+            m_encoder->Mul(m_destination, src[0], src[1]);
+            m_encoder->Push();
+            return;
+        }
+
         for (unsigned i = 0; i < VectorSize; ++i) {
             SetSourceModifiers(0, sources[0]);
             SetSourceModifiers(1, sources[1]);
@@ -4850,6 +4881,27 @@ void EmitPass::VectorMad(const SSource sources[3], const DstModifier& modifier)
 
     unsigned VectorSize = getVectorSize(sources[0].value);
 
+    bool AllUniform = src[0]->IsUniform() &&
+        src[1]->IsUniform() && src[2]->IsUniform() &&
+        m_destination->IsUniform();
+
+    // We cannot emit a 16-wide instruction if the SIMD size is set to 8, but we can emit a 4-wide one.
+    // Simple ALU instructions have the same possible width as the SIMD size; "math" pipeline
+    // instructions have a reduced width.
+    bool CanEmitThisSize = VectorSize <= numLanes(m_currShader->m_SIMDSize);
+
+    if (IGC_IS_FLAG_ENABLED(VectorizerUniformValueVectorizationEnabled) && AllUniform && CanEmitThisSize) {
+        // regioning must be updated by hand, DO NOT COPY for fptrunc
+        m_encoder->SetSrcRegion(0, 1, 1, 0);
+        m_encoder->SetSrcRegion(1, 1, 1, 0);
+        m_encoder->SetSrcRegion(2, 1, 1, 0);
+        // this will force no_mask and proper uniform SIMD SIZE
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(VectorSize));
+        m_encoder->Mad(m_destination, src[0], src[1], src[2]);
+        m_encoder->Push();
+        return;
+    }
+
     for (unsigned i = 0; i < VectorSize; ++i) {
 
         SetSourceModifiers(0, sources[0]);
@@ -4863,10 +4915,8 @@ void EmitPass::VectorMad(const SSource sources[3], const DstModifier& modifier)
         if (src[2]->IsUniform()) m_encoder->SetSrcSubReg(2, i);
         else m_encoder->SetSrcSubVar(2, i);
 
-        bool AllAreUniform = src[0]->IsUniform() &&
-            src[1]->IsUniform() && src[2]->IsUniform();
 
-        if (AllAreUniform) m_encoder->SetDstSubReg(i);
+        if (AllUniform) m_encoder->SetDstSubReg(i);
         else m_encoder->SetDstSubVar(i);
 
         m_encoder->Mad(m_destination, src[0], src[1], src[2]);
diff --git a/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-fmad-uniform.ll b/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-fmad-uniform.ll
@@ -1,7 +1,7 @@
 ; UNSUPPORTED: system-windows
 ; REQUIRES: regkeys
 
-; RUN: igc_opt -S -dce -platformpvc -rev-id B -has-emulated-64-bit-insts -igc-emit-visa --regkey=DumpVISAASMToConsole=1 -simd-mode 16 < %s | FileCheck %s
+; RUN: igc_opt -S -dce -platformpvc -rev-id B -has-emulated-64-bit-insts -igc-emit-visa --regkey=DumpVISAASMToConsole=1 --regkey=VectorizerUniformValueVectorizationEnabled=0 -simd-mode 16 < %s | FileCheck %s
 
 ; CHECK: .decl vectorized_phi1095 v_type=G type=f num_elts=8 align=dword
 ; CHECK: .decl vectorized_phi1116 v_type=G type=f num_elts=8 align=dword
diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-ugly-chain.ll b/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-ugly-chain.ll
@@ -0,0 +1,89 @@
+; RUN: igc_opt -S  --igc-vectorizer -dce < %s 2>&1 | FileCheck %s
+
+; CHECK: %vectorized_phi
+; CHECK: %vector = insertelement <8 x float> undef
+; CHECK: %vector1 = insertelement <8 x float> %vector
+; CHECK: %vector2 = insertelement <8 x float> %vector1
+; CHECK: %vector3 = insertelement <8 x float> %vector2
+; CHECK: %vector4 = insertelement <8 x float> %vector3
+; CHECK: %vector5 = insertelement <8 x float> %vector4
+; the original IR uses the same value twice; the vectorizer must handle the repeated operand correctly
+; CHECK: %vector6 = insertelement <8 x float> %vector5, float [[REPEATED:%.*]], i32 6
+; CHECK: %vector7 = insertelement <8 x float> %vector6, float [[REPEATED]], i32 7
+; CHECK: %vectorized_binary = fmul fast <8 x float> %vector7, %vectorized_phi
+; CHECK: call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %vectorized_binary
+
+; ModuleID = 'reduced.ll'
+source_filename = "initial_test.ll"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @quux() {
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge, %0
+  %1 = phi float [ 0.000000e+00, %0 ], [ %35, %._crit_edge ]
+  %2 = phi float [ 0.000000e+00, %0 ], [ %36, %._crit_edge ]
+  %3 = phi float [ 0.000000e+00, %0 ], [ %37, %._crit_edge ]
+  %4 = phi float [ 0.000000e+00, %0 ], [ %38, %._crit_edge ]
+  %5 = phi float [ 0.000000e+00, %0 ], [ %39, %._crit_edge ]
+  %6 = phi float [ 0.000000e+00, %0 ], [ %40, %._crit_edge ]
+  %7 = phi float [ 0.000000e+00, %0 ], [ %41, %._crit_edge ]
+  %8 = phi float [ 0.000000e+00, %0 ], [ %42, %._crit_edge ]
+  %9 = call float @llvm.exp2.f32(float 0.000000e+00)
+  %10 = call float @llvm.exp2.f32(float 1.000000e+00)
+  %11 = call float @llvm.exp2.f32(float 2.000000e+00)
+  %12 = call float @llvm.exp2.f32(float 3.000000e+00)
+  %13 = call float @llvm.exp2.f32(float 4.000000e+00)
+  %14 = call float @llvm.exp2.f32(float 5.000000e+00)
+  %15 = call float @llvm.exp2.f32(float 6.000000e+00)
+  %16 = call float @llvm.exp2.f32(float 7.000000e+00)
+  %17 = fmul fast float %9, %1
+  %18 = fmul fast float %10, %2
+  %19 = fmul fast float %11, %3
+  %20 = fmul fast float %12, %4
+  %21 = fmul fast float %13, %5
+  %22 = fmul fast float %14, %6
+  %23 = fmul fast float %15, %7
+  %24 = fmul fast float %15, %8 ; we double the same variable and check that vectorizer builds correct scheme
+  %25 = insertelement <8 x float> zeroinitializer, float %17, i64 0
+  %26 = insertelement <8 x float> %25, float %18, i64 1
+  %27 = insertelement <8 x float> %26, float %19, i64 2
+  %28 = insertelement <8 x float> %27, float %20, i64 3
+  %29 = insertelement <8 x float> %28, float %21, i64 4
+  %30 = insertelement <8 x float> %29, float %22, i64 5
+  %31 = insertelement <8 x float> %30, float %23, i64 6
+  %32 = insertelement <8 x float> %31, float %24, i64 7
+  %33 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %32, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %34 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %33, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %35 = extractelement <8 x float> %34, i64 0
+  %36 = extractelement <8 x float> %34, i64 1
+  %37 = extractelement <8 x float> %34, i64 2
+  %38 = extractelement <8 x float> %34, i64 3
+  %39 = extractelement <8 x float> %34, i64 4
+  %40 = extractelement <8 x float> %34, i64 5
+  %41 = extractelement <8 x float> %34, i64 6
+  %42 = extractelement <8 x float> %34, i64 7
+  br label %._crit_edge
+}
+
+; Function Attrs: convergent nounwind readnone willreturn
+declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.exp2.f32(float) #2
+
+; uselistorder directives
+uselistorder <8 x float> (<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1)* @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32, { 1, 0 }
+uselistorder float (float)* @llvm.exp2.f32, { 7, 6, 5, 4, 3, 2, 1, 0 }
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone willreturn }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!igc.functions = !{!0}
+!0 = !{void ()* @quux, !1}
+!1 = !{!2, !3}
+!2 = !{!"function_type", i32 0}
+!3 = !{!"sub_group_size", i32 16}
diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-wider-data.ll b/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-wider-data.ll
@@ -0,0 +1,92 @@
+; RUN: igc_opt -S  --igc-vectorizer -dce < %s 2>&1 | FileCheck %s
+
+; ModuleID = 'reduced.ll'
+source_filename = "initial_test.ll"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @quux(< 16 x float>* %ptr) {
+
+  %bulkData = load <16 x float>, <16 x float>* %ptr
+
+  %extractedValue_0 = extractelement <16 x float> %bulkData, i64 0
+  %extractedValue_1 = extractelement <16 x float> %bulkData, i64 1
+  %extractedValue_2 = extractelement <16 x float> %bulkData, i64 2
+  %extractedValue_3 = extractelement <16 x float> %bulkData, i64 3
+  %extractedValue_4 = extractelement <16 x float> %bulkData, i64 4
+  %extractedValue_5 = extractelement <16 x float> %bulkData, i64 5
+  %extractedValue_6 = extractelement <16 x float> %bulkData, i64 6
+  %extractedValue_7 = extractelement <16 x float> %bulkData, i64 7
+
+  %extractedValue_8 = extractelement <16 x float> %bulkData, i64 8
+  %extractedValue_9 = extractelement <16 x float> %bulkData, i64 9
+  %extractedValue_10 = extractelement <16 x float> %bulkData, i64 10
+  %extractedValue_11 = extractelement <16 x float> %bulkData, i64 11
+  %extractedValue_12 = extractelement <16 x float> %bulkData, i64 12
+  %extractedValue_13 = extractelement <16 x float> %bulkData, i64 13
+  %extractedValue_14 = extractelement <16 x float> %bulkData, i64 14
+  %extractedValue_15 = extractelement <16 x float> %bulkData, i64 15
+
+  br label %._crit_edge
+
+  ; CHECK-LABEL: ._crit_edge:
+._crit_edge:                                      ; preds = %._crit_edge, %0
+
+  ; CHECK-NOT: phi <8 x float>{{.*}}
+  %1 = phi float [ %extractedValue_8, %0 ],  [ %a35, %._crit_edge ]
+  %2 = phi float [ %extractedValue_9, %0 ],  [ %a36, %._crit_edge ]
+  %3 = phi float [ %extractedValue_10, %0 ], [ %a37, %._crit_edge ]
+  %4 = phi float [ %extractedValue_11, %0 ], [ %a38, %._crit_edge ]
+  %5 = phi float [ %extractedValue_12, %0 ], [ %a39, %._crit_edge ]
+  %6 = phi float [ %extractedValue_13, %0 ], [ %a40, %._crit_edge ]
+  %7 = phi float [ %extractedValue_14, %0 ], [ %a41, %._crit_edge ]
+  %8 = phi float [ %extractedValue_15, %0 ], [ %a42, %._crit_edge ]
+
+  ; CHECK-NOT: fmul fast <8 x float>{{.*}}
+  %a17 = fmul fast float %1, %extractedValue_0
+  %a18 = fmul fast float %2, %extractedValue_1
+  %a19 = fmul fast float %3, %extractedValue_2
+  %a20 = fmul fast float %4, %extractedValue_3
+  %a21 = fmul fast float %5, %extractedValue_4
+  %a22 = fmul fast float %6, %extractedValue_5
+  %a23 = fmul fast float %7, %extractedValue_6
+  %a24 = fmul fast float %8, %extractedValue_7
+
+  %a25 = insertelement <8 x float> zeroinitializer, float %a17, i64 0
+  %a26 = insertelement <8 x float> %a25, float %a18, i64 1
+  %a27 = insertelement <8 x float> %a26, float %a19, i64 2
+  %a28 = insertelement <8 x float> %a27, float %a20, i64 3
+  %a29 = insertelement <8 x float> %a28, float %a21, i64 4
+  %a30 = insertelement <8 x float> %a29, float %a22, i64 5
+  %a31 = insertelement <8 x float> %a30, float %a23, i64 6
+  %a32 = insertelement <8 x float> %a31, float %a24, i64 7
+  %a33 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %a32, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %a34 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %a33, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %a35 = extractelement <8 x float> %a34, i64 0
+  %a36 = extractelement <8 x float> %a34, i64 1
+  %a37 = extractelement <8 x float> %a34, i64 2
+  %a38 = extractelement <8 x float> %a34, i64 3
+  %a39 = extractelement <8 x float> %a34, i64 4
+  %a40 = extractelement <8 x float> %a34, i64 5
+  %a41 = extractelement <8 x float> %a34, i64 6
+  %a42 = extractelement <8 x float> %a34, i64 7
+  ; CHECK: br label %._crit_edge
+  br label %._crit_edge
+}
+
+; Function Attrs: convergent nounwind readnone willreturn
+declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.exp2.f32(float) #2
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone willreturn }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!igc.functions = !{!0}
+!0 = !{void (<16 x float>* )* @quux, !1}
+!1 = !{!2, !3}
+!2 = !{!"function_type", i32 0}
+!3 = !{!"sub_group_size", i32 16}
diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h
@@ -533,6 +533,7 @@ DECLARE_IGC_REGKEY(bool, VectorizerAllowFPTRUNC, true, "Allow FPTRUNC instructio
 DECLARE_IGC_REGKEY(bool, VectorizerAllowFDIV, true, "Allow FDIV instructions inside vectorizer", true)
 DECLARE_IGC_REGKEY(bool, VectorizerAllowFADD, true, "Allow FADD instructions inside vectorizer", true)
 DECLARE_IGC_REGKEY(bool, VectorizerAllowFMADMatching, true, "Allow FADD and FMUL instructions to be matched later in the pattern match pass", true)
+DECLARE_IGC_REGKEY(bool, VectorizerUniformValueVectorizationEnabled, true, "Vector Emitter emits vectorized instruction for uniform values", true)
 DECLARE_IGC_REGKEY(bool, DisableOCLScalarizer,          false, "Disable ScalarizeFunction pass in OCL pipeline", true)
 DECLARE_IGC_REGKEY(bool, DisablePHIScalarization,       false, "Disable scalarization of PHINode instructions", true)
 DECLARE_IGC_REGKEY(bool, EnableSelectiveScalarizer,     false,  "enable selective scalarizer on GPGPU path", true)