Skip to content

Commit 693cfe5

Browse files
ekochetkigcbot
authored and committed
Coalescing of uniform values
IGCVectorizer can vectorize uniform values. Once they are bundled into a vector, vector emission now supports coalescing them: we emit a single wide instruction to cover multiple uniform values that were previously processed one by one.
1 parent 59a11ab commit 693cfe5

File tree

5 files changed

+237
-5
lines changed

5 files changed

+237
-5
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4677,7 +4677,6 @@ void EmitPass::FPTrunc(const SSource sources[2], const DstModifier& modifier) {
46774677
}
46784678
}
46794679

4680-
46814680
void EmitPass::Add(const SSource sources[2], const DstModifier& modifier)
46824681
{
46834682
CVariable* src[2];
@@ -4693,6 +4692,21 @@ void EmitPass::Add(const SSource sources[2], const DstModifier& modifier)
46934692
IGC_ASSERT_EXIT_MESSAGE(numLanes(m_encoder->GetSimdSize()) == 16, "As of now Vector Emission is only supported for SIMD16");
46944693
unsigned VectorSize = getVectorSize(sources[0].value);
46954694

4695+
bool AllUniform = src[0]->IsUniform() && src[1]->IsUniform() && m_destination->IsUniform();
4696+
// cannot emit 16 SIMD if SIMD SIZE is set to 8, but can emit 4
4697+
// simple ALU instructions has the same possible width as SIMD, "math" pipeline instructions
4698+
// has reduced width
4699+
bool CanEmitThisSize = VectorSize <= numLanes(m_currShader->m_SIMDSize);
4700+
4701+
if (IGC_IS_FLAG_ENABLED(VectorizerUniformValueVectorizationEnabled) && AllUniform && CanEmitThisSize) {
4702+
m_encoder->SetSrcRegion(0, 1, 1, 0);
4703+
m_encoder->SetSrcRegion(1, 1, 1, 0);
4704+
m_encoder->SetUniformSIMDSize(lanesToSIMDMode(VectorSize));
4705+
m_encoder->Add(m_destination, src[0], src[1]);
4706+
m_encoder->Push();
4707+
return;
4708+
}
4709+
46964710
for (unsigned i = 0; i < VectorSize; ++i) {
46974711
SetSourceModifiers(0, sources[0]);
46984712
SetSourceModifiers(1, sources[1]);
@@ -4726,13 +4740,30 @@ void EmitPass::Mul(const SSource sources[2], const DstModifier& modifier)
47264740
src[i] = GetSrcVariable(sources[i]);
47274741
}
47284742

4743+
unsigned SIMDSize = numLanes(m_currShader->m_SIMDSize);
4744+
47294745
if (IGC_IS_FLAG_ENABLED(EnableVectorEmitter) &&
47304746
sources[0].value->getType()->isVectorTy() &&
47314747
sources[1].value->getType()->isVectorTy()) {
47324748

47334749
IGC_ASSERT_EXIT_MESSAGE(numLanes(m_encoder->GetSimdSize()) == 16, "As of now Vector Emission is only supported for SIMD16");
47344750
unsigned VectorSize = getVectorSize(sources[0].value);
47354751

4752+
bool AllUniform = src[0]->IsUniform() && src[1]->IsUniform() && m_destination->IsUniform();
4753+
// cannot emit 16 SIMD if SIMD SIZE is set to 8, but can emit 4
4754+
// simple ALU instructions has the same possible width as SIMD, "math" pipeline instructions
4755+
// has reduced width
4756+
bool CanEmitThisSize = VectorSize <= SIMDSize;
4757+
4758+
if (IGC_IS_FLAG_ENABLED(VectorizerUniformValueVectorizationEnabled) && AllUniform && CanEmitThisSize) {
4759+
m_encoder->SetSrcRegion(0, 1, 1, 0);
4760+
m_encoder->SetSrcRegion(1, 1, 1, 0);
4761+
m_encoder->SetUniformSIMDSize(lanesToSIMDMode(VectorSize));
4762+
m_encoder->Mul(m_destination, src[0], src[1]);
4763+
m_encoder->Push();
4764+
return;
4765+
}
4766+
47364767
for (unsigned i = 0; i < VectorSize; ++i) {
47374768
SetSourceModifiers(0, sources[0]);
47384769
SetSourceModifiers(1, sources[1]);
@@ -4850,6 +4881,27 @@ void EmitPass::VectorMad(const SSource sources[3], const DstModifier& modifier)
48504881

48514882
unsigned VectorSize = getVectorSize(sources[0].value);
48524883

4884+
bool AllUniform = src[0]->IsUniform() &&
4885+
src[1]->IsUniform() && src[2]->IsUniform() &&
4886+
m_destination->IsUniform();
4887+
4888+
// cannot emit 16 SIMD if SIMD SIZE is set to 8, but can emit 4
4889+
// simple ALU instructions has the same possible width as SIMD, "math" pipeline instructions
4890+
// has reduced width
4891+
bool CanEmitThisSize = VectorSize <= numLanes(m_currShader->m_SIMDSize);
4892+
4893+
if (IGC_IS_FLAG_ENABLED(VectorizerUniformValueVectorizationEnabled) && AllUniform && CanEmitThisSize) {
4894+
// regioning must be updated by hand, DO NOT COPY for fptrunc
4895+
m_encoder->SetSrcRegion(0, 1, 1, 0);
4896+
m_encoder->SetSrcRegion(1, 1, 1, 0);
4897+
m_encoder->SetSrcRegion(2, 1, 1, 0);
4898+
// this will force no_mask and proper uniform SIMD SIZE
4899+
m_encoder->SetUniformSIMDSize(lanesToSIMDMode(VectorSize));
4900+
m_encoder->Mad(m_destination, src[0], src[1], src[2]);
4901+
m_encoder->Push();
4902+
return;
4903+
}
4904+
48534905
for (unsigned i = 0; i < VectorSize; ++i) {
48544906

48554907
SetSourceModifiers(0, sources[0]);
@@ -4863,10 +4915,8 @@ void EmitPass::VectorMad(const SSource sources[3], const DstModifier& modifier)
48634915
if (src[2]->IsUniform()) m_encoder->SetSrcSubReg(2, i);
48644916
else m_encoder->SetSrcSubVar(2, i);
48654917

4866-
bool AllAreUniform = src[0]->IsUniform() &&
4867-
src[1]->IsUniform() && src[2]->IsUniform();
48684918

4869-
if (AllAreUniform) m_encoder->SetDstSubReg(i);
4919+
if (AllUniform) m_encoder->SetDstSubReg(i);
48704920
else m_encoder->SetDstSubVar(i);
48714921

48724922
m_encoder->Mad(m_destination, src[0], src[1], src[2]);

IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-fmad-uniform.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; UNSUPPORTED: system-windows
22
; REQUIRES: regkeys
33

4-
; RUN: igc_opt -S -dce -platformpvc -rev-id B -has-emulated-64-bit-insts -igc-emit-visa --regkey=DumpVISAASMToConsole=1 -simd-mode 16 < %s | FileCheck %s
4+
; RUN: igc_opt -S -dce -platformpvc -rev-id B -has-emulated-64-bit-insts -igc-emit-visa --regkey=DumpVISAASMToConsole=1 --regkey=VectorizerUniformValueVectorizationEnabled=0 -simd-mode 16 < %s | FileCheck %s
55

66
; CHECK: .decl vectorized_phi1095 v_type=G type=f num_elts=8 align=dword
77
; CHECK: .decl vectorized_phi1116 v_type=G type=f num_elts=8 align=dword
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
; RUN: igc_opt -S --igc-vectorizer -dce < %s 2>&1 | FileCheck %s
2+
3+
; CHECK: %vectorized_phi
4+
; CHECK: %vector = insertelement <8 x float> undef
5+
; CHECK: %vector1 = insertelement <8 x float> %vector
6+
; CHECK: %vector2 = insertelement <8 x float> %vector1
7+
; CHECK: %vector3 = insertelement <8 x float> %vector2
8+
; CHECK: %vector4 = insertelement <8 x float> %vector3
9+
; CHECK: %vector5 = insertelement <8 x float> %vector4
10+
; we use the same variable twice in original IR, vectorizer must process it correctly
11+
; CHECK: %vector6 = insertelement <8 x float> %vector5, float [[REPEATED:%.*]], i32 6
12+
; CHECK: %vector7 = insertelement <8 x float> %vector6, float [[REPEATED]], i32 7
13+
; CHECK: %vectorized_binary = fmul fast <8 x float> %vector7, %vectorized_phi
14+
; CHECK: call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %vectorized_binary
15+
16+
; ModuleID = 'reduced.ll'
17+
source_filename = "initial_test.ll"
18+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
19+
target triple = "spir64-unknown-unknown"
20+
21+
; Function Attrs: convergent nounwind
22+
define spir_kernel void @quux() {
23+
br label %._crit_edge
24+
25+
._crit_edge: ; preds = %._crit_edge, %0
26+
%1 = phi float [ 0.000000e+00, %0 ], [ %35, %._crit_edge ]
27+
%2 = phi float [ 0.000000e+00, %0 ], [ %36, %._crit_edge ]
28+
%3 = phi float [ 0.000000e+00, %0 ], [ %37, %._crit_edge ]
29+
%4 = phi float [ 0.000000e+00, %0 ], [ %38, %._crit_edge ]
30+
%5 = phi float [ 0.000000e+00, %0 ], [ %39, %._crit_edge ]
31+
%6 = phi float [ 0.000000e+00, %0 ], [ %40, %._crit_edge ]
32+
%7 = phi float [ 0.000000e+00, %0 ], [ %41, %._crit_edge ]
33+
%8 = phi float [ 0.000000e+00, %0 ], [ %42, %._crit_edge ]
34+
%9 = call float @llvm.exp2.f32(float 0.000000e+00)
35+
%10 = call float @llvm.exp2.f32(float 1.000000e+00)
36+
%11 = call float @llvm.exp2.f32(float 2.000000e+00)
37+
%12 = call float @llvm.exp2.f32(float 3.000000e+00)
38+
%13 = call float @llvm.exp2.f32(float 4.000000e+00)
39+
%14 = call float @llvm.exp2.f32(float 5.000000e+00)
40+
%15 = call float @llvm.exp2.f32(float 6.000000e+00)
41+
%16 = call float @llvm.exp2.f32(float 7.000000e+00)
42+
%17 = fmul fast float %9, %1
43+
%18 = fmul fast float %10, %2
44+
%19 = fmul fast float %11, %3
45+
%20 = fmul fast float %12, %4
46+
%21 = fmul fast float %13, %5
47+
%22 = fmul fast float %14, %6
48+
%23 = fmul fast float %15, %7
49+
%24 = fmul fast float %15, %8 ; we double the same variable and check that vectorizer builds correct scheme
50+
%25 = insertelement <8 x float> zeroinitializer, float %17, i64 0
51+
%26 = insertelement <8 x float> %25, float %18, i64 1
52+
%27 = insertelement <8 x float> %26, float %19, i64 2
53+
%28 = insertelement <8 x float> %27, float %20, i64 3
54+
%29 = insertelement <8 x float> %28, float %21, i64 4
55+
%30 = insertelement <8 x float> %29, float %22, i64 5
56+
%31 = insertelement <8 x float> %30, float %23, i64 6
57+
%32 = insertelement <8 x float> %31, float %24, i64 7
58+
%33 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %32, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
59+
%34 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %33, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
60+
%35 = extractelement <8 x float> %34, i64 0
61+
%36 = extractelement <8 x float> %34, i64 1
62+
%37 = extractelement <8 x float> %34, i64 2
63+
%38 = extractelement <8 x float> %34, i64 3
64+
%39 = extractelement <8 x float> %34, i64 4
65+
%40 = extractelement <8 x float> %34, i64 5
66+
%41 = extractelement <8 x float> %34, i64 6
67+
%42 = extractelement <8 x float> %34, i64 7
68+
br label %._crit_edge
69+
}
70+
71+
; Function Attrs: convergent nounwind readnone willreturn
72+
declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
73+
74+
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
75+
declare float @llvm.exp2.f32(float) #2
76+
77+
; uselistorder directives
78+
uselistorder <8 x float> (<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1)* @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32, { 1, 0 }
79+
uselistorder float (float)* @llvm.exp2.f32, { 7, 6, 5, 4, 3, 2, 1, 0 }
80+
81+
attributes #0 = { convergent nounwind }
82+
attributes #1 = { convergent nounwind readnone willreturn }
83+
attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
84+
85+
!igc.functions = !{!0}
86+
!0 = !{void ()* @quux, !1}
87+
!1 = !{!2, !3}
88+
!2 = !{!"function_type", i32 0}
89+
!3 = !{!"sub_group_size", i32 16}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
; RUN: igc_opt -S --igc-vectorizer -dce < %s 2>&1 | FileCheck %s
2+
3+
; ModuleID = 'reduced.ll'
4+
source_filename = "initial_test.ll"
5+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
6+
target triple = "spir64-unknown-unknown"
7+
8+
; Function Attrs: convergent nounwind
9+
define spir_kernel void @quux(< 16 x float>* %ptr) {
10+
11+
%bulkData = load <16 x float>, <16 x float>* %ptr
12+
13+
%extractedValue_0 = extractelement <16 x float> %bulkData, i64 0
14+
%extractedValue_1 = extractelement <16 x float> %bulkData, i64 1
15+
%extractedValue_2 = extractelement <16 x float> %bulkData, i64 2
16+
%extractedValue_3 = extractelement <16 x float> %bulkData, i64 3
17+
%extractedValue_4 = extractelement <16 x float> %bulkData, i64 4
18+
%extractedValue_5 = extractelement <16 x float> %bulkData, i64 5
19+
%extractedValue_6 = extractelement <16 x float> %bulkData, i64 6
20+
%extractedValue_7 = extractelement <16 x float> %bulkData, i64 7
21+
22+
%extractedValue_8 = extractelement <16 x float> %bulkData, i64 8
23+
%extractedValue_9 = extractelement <16 x float> %bulkData, i64 9
24+
%extractedValue_10 = extractelement <16 x float> %bulkData, i64 10
25+
%extractedValue_11 = extractelement <16 x float> %bulkData, i64 11
26+
%extractedValue_12 = extractelement <16 x float> %bulkData, i64 12
27+
%extractedValue_13 = extractelement <16 x float> %bulkData, i64 13
28+
%extractedValue_14 = extractelement <16 x float> %bulkData, i64 14
29+
%extractedValue_15 = extractelement <16 x float> %bulkData, i64 15
30+
31+
br label %._crit_edge
32+
33+
; CHECK-LABEL: ._crit_edge:
34+
._crit_edge: ; preds = %._crit_edge, %0
35+
36+
; CHECK-NOT: phi <8 x float>{{.*}}
37+
%1 = phi float [ %extractedValue_8, %0 ], [ %a35, %._crit_edge ]
38+
%2 = phi float [ %extractedValue_9, %0 ], [ %a36, %._crit_edge ]
39+
%3 = phi float [ %extractedValue_10, %0 ], [ %a37, %._crit_edge ]
40+
%4 = phi float [ %extractedValue_11, %0 ], [ %a38, %._crit_edge ]
41+
%5 = phi float [ %extractedValue_12, %0 ], [ %a39, %._crit_edge ]
42+
%6 = phi float [ %extractedValue_13, %0 ], [ %a40, %._crit_edge ]
43+
%7 = phi float [ %extractedValue_14, %0 ], [ %a41, %._crit_edge ]
44+
%8 = phi float [ %extractedValue_15, %0 ], [ %a42, %._crit_edge ]
45+
46+
; CHECK-NOT: fmul fast <8 x float>{{.*}}
47+
%a17 = fmul fast float %1, %extractedValue_0
48+
%a18 = fmul fast float %2, %extractedValue_1
49+
%a19 = fmul fast float %3, %extractedValue_2
50+
%a20 = fmul fast float %4, %extractedValue_3
51+
%a21 = fmul fast float %5, %extractedValue_4
52+
%a22 = fmul fast float %6, %extractedValue_5
53+
%a23 = fmul fast float %7, %extractedValue_6
54+
%a24 = fmul fast float %8, %extractedValue_7
55+
56+
%a25 = insertelement <8 x float> zeroinitializer, float %a17, i64 0
57+
%a26 = insertelement <8 x float> %a25, float %a18, i64 1
58+
%a27 = insertelement <8 x float> %a26, float %a19, i64 2
59+
%a28 = insertelement <8 x float> %a27, float %a20, i64 3
60+
%a29 = insertelement <8 x float> %a28, float %a21, i64 4
61+
%a30 = insertelement <8 x float> %a29, float %a22, i64 5
62+
%a31 = insertelement <8 x float> %a30, float %a23, i64 6
63+
%a32 = insertelement <8 x float> %a31, float %a24, i64 7
64+
%a33 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %a32, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
65+
%a34 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %a33, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
66+
%a35 = extractelement <8 x float> %a34, i64 0
67+
%a36 = extractelement <8 x float> %a34, i64 1
68+
%a37 = extractelement <8 x float> %a34, i64 2
69+
%a38 = extractelement <8 x float> %a34, i64 3
70+
%a39 = extractelement <8 x float> %a34, i64 4
71+
%a40 = extractelement <8 x float> %a34, i64 5
72+
%a41 = extractelement <8 x float> %a34, i64 6
73+
%a42 = extractelement <8 x float> %a34, i64 7
74+
; CHECK: br label %._crit_edge
75+
br label %._crit_edge
76+
}
77+
78+
; Function Attrs: convergent nounwind readnone willreturn
79+
declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
80+
81+
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
82+
declare float @llvm.exp2.f32(float) #2
83+
84+
attributes #0 = { convergent nounwind }
85+
attributes #1 = { convergent nounwind readnone willreturn }
86+
attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
87+
88+
!igc.functions = !{!0}
89+
!0 = !{void (<16 x float>* )* @quux, !1}
90+
!1 = !{!2, !3}
91+
!2 = !{!"function_type", i32 0}
92+
!3 = !{!"sub_group_size", i32 16}

IGC/common/igc_flags.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,7 @@ DECLARE_IGC_REGKEY(bool, VectorizerAllowFPTRUNC, true, "Allow FPTRUNC instructio
533533
DECLARE_IGC_REGKEY(bool, VectorizerAllowFDIV, true, "Allow FDIV instructions inside vectorizer", true)
534534
DECLARE_IGC_REGKEY(bool, VectorizerAllowFADD, true, "Allow FADD instructions inside vectorizer", true)
535535
DECLARE_IGC_REGKEY(bool, VectorizerAllowFMADMatching, true, "Allow FADD and FMUL instructions to be matched later in the pattern match pass", true)
536+
DECLARE_IGC_REGKEY(bool, VectorizerUniformValueVectorizationEnabled, true, "Vector Emitter emits vectorized instruction for uniform values", true)
536537
DECLARE_IGC_REGKEY(bool, DisableOCLScalarizer, false, "Disable ScalarizeFunction pass in OCL pipeline", true)
537538
DECLARE_IGC_REGKEY(bool, DisablePHIScalarization, false, "Disable scalarization of PHINode instructions", true)
538539
DECLARE_IGC_REGKEY(bool, EnableSelectiveScalarizer, false, "enable selective scalarizer on GPGPU path", true)

0 commit comments

Comments
 (0)