Skip to content

Commit 8b23ebb

Browse files
authored
[AMDGPU][True16][MC] true16 for v_max3/min3_num_f16 (#121510)
V_MAX3/MIN3_NUM_F16 are the GFX12 aliases of the GFX11 instructions V_MAX3/MIN3_F16, so the two sets should be updated together. This fixes a bug introduced in #113603, where only V_MAX3/MIN3_F16 were converted to the true16 format. Also adds GFX12 RUN lines for the CodeGen test.
1 parent 67ff11e commit 8b23ebb

File tree

10 files changed

+1056
-316
lines changed

10 files changed

+1056
-316
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

+2-2
Original file line numberDiff line numberDiff line change
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
15781578

15791579
defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
15801580
defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
1581-
defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
1582-
defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
1581+
defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
1582+
defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
15831583
defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>;
15841584
defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
15851585
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;

llvm/test/CodeGen/AMDGPU/fmax3.ll

+134
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
44
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
55
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
67

78
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
89
; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
124125
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
125126
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
126127
; GFX11-NEXT: s_endpgm
128+
;
129+
; GFX12-LABEL: test_fmax3_olt_0_f32:
130+
; GFX12: ; %bb.0:
131+
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
132+
; GFX12-NEXT: s_mov_b32 s10, -1
133+
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
134+
; GFX12-NEXT: s_mov_b32 s14, s10
135+
; GFX12-NEXT: s_mov_b32 s15, s11
136+
; GFX12-NEXT: s_mov_b32 s18, s10
137+
; GFX12-NEXT: s_mov_b32 s19, s11
138+
; GFX12-NEXT: s_mov_b32 s22, s10
139+
; GFX12-NEXT: s_mov_b32 s23, s11
140+
; GFX12-NEXT: s_wait_kmcnt 0x0
141+
; GFX12-NEXT: s_mov_b32 s12, s2
142+
; GFX12-NEXT: s_mov_b32 s13, s3
143+
; GFX12-NEXT: s_mov_b32 s16, s4
144+
; GFX12-NEXT: s_mov_b32 s17, s5
145+
; GFX12-NEXT: s_mov_b32 s20, s6
146+
; GFX12-NEXT: s_mov_b32 s21, s7
147+
; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
148+
; GFX12-NEXT: s_wait_loadcnt 0x0
149+
; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
150+
; GFX12-NEXT: s_wait_loadcnt 0x0
151+
; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
152+
; GFX12-NEXT: s_wait_loadcnt 0x0
153+
; GFX12-NEXT: s_mov_b32 s8, s0
154+
; GFX12-NEXT: s_mov_b32 s9, s1
155+
; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
156+
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
157+
; GFX12-NEXT: s_endpgm
127158
%a = load volatile float, ptr addrspace(1) %aptr, align 4
128159
%b = load volatile float, ptr addrspace(1) %bptr, align 4
129160
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
254285
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
255286
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
256287
; GFX11-NEXT: s_endpgm
288+
;
289+
; GFX12-LABEL: test_fmax3_olt_1_f32:
290+
; GFX12: ; %bb.0:
291+
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
292+
; GFX12-NEXT: s_mov_b32 s10, -1
293+
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
294+
; GFX12-NEXT: s_mov_b32 s14, s10
295+
; GFX12-NEXT: s_mov_b32 s15, s11
296+
; GFX12-NEXT: s_mov_b32 s18, s10
297+
; GFX12-NEXT: s_mov_b32 s19, s11
298+
; GFX12-NEXT: s_mov_b32 s22, s10
299+
; GFX12-NEXT: s_mov_b32 s23, s11
300+
; GFX12-NEXT: s_wait_kmcnt 0x0
301+
; GFX12-NEXT: s_mov_b32 s12, s2
302+
; GFX12-NEXT: s_mov_b32 s13, s3
303+
; GFX12-NEXT: s_mov_b32 s16, s4
304+
; GFX12-NEXT: s_mov_b32 s17, s5
305+
; GFX12-NEXT: s_mov_b32 s20, s6
306+
; GFX12-NEXT: s_mov_b32 s21, s7
307+
; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
308+
; GFX12-NEXT: s_wait_loadcnt 0x0
309+
; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
310+
; GFX12-NEXT: s_wait_loadcnt 0x0
311+
; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
312+
; GFX12-NEXT: s_wait_loadcnt 0x0
313+
; GFX12-NEXT: s_mov_b32 s8, s0
314+
; GFX12-NEXT: s_mov_b32 s9, s1
315+
; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
316+
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
317+
; GFX12-NEXT: s_endpgm
257318
%a = load volatile float, ptr addrspace(1) %aptr, align 4
258319
%b = load volatile float, ptr addrspace(1) %bptr, align 4
259320
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
391452
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
392453
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
393454
; GFX11-NEXT: s_endpgm
455+
;
456+
; GFX12-LABEL: test_fmax3_olt_0_f16:
457+
; GFX12: ; %bb.0:
458+
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
459+
; GFX12-NEXT: s_mov_b32 s10, -1
460+
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
461+
; GFX12-NEXT: s_mov_b32 s14, s10
462+
; GFX12-NEXT: s_mov_b32 s15, s11
463+
; GFX12-NEXT: s_mov_b32 s18, s10
464+
; GFX12-NEXT: s_mov_b32 s19, s11
465+
; GFX12-NEXT: s_mov_b32 s22, s10
466+
; GFX12-NEXT: s_mov_b32 s23, s11
467+
; GFX12-NEXT: s_wait_kmcnt 0x0
468+
; GFX12-NEXT: s_mov_b32 s12, s2
469+
; GFX12-NEXT: s_mov_b32 s13, s3
470+
; GFX12-NEXT: s_mov_b32 s16, s4
471+
; GFX12-NEXT: s_mov_b32 s17, s5
472+
; GFX12-NEXT: s_mov_b32 s20, s6
473+
; GFX12-NEXT: s_mov_b32 s21, s7
474+
; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
475+
; GFX12-NEXT: s_wait_loadcnt 0x0
476+
; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
477+
; GFX12-NEXT: s_wait_loadcnt 0x0
478+
; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
479+
; GFX12-NEXT: s_wait_loadcnt 0x0
480+
; GFX12-NEXT: s_mov_b32 s8, s0
481+
; GFX12-NEXT: s_mov_b32 s9, s1
482+
; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2
483+
; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
484+
; GFX12-NEXT: s_endpgm
394485
%a = load volatile half, ptr addrspace(1) %aptr, align 2
395486
%b = load volatile half, ptr addrspace(1) %bptr, align 2
396487
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
529620
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
530621
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
531622
; GFX11-NEXT: s_endpgm
623+
;
624+
; GFX12-LABEL: test_fmax3_olt_1_f16:
625+
; GFX12: ; %bb.0:
626+
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
627+
; GFX12-NEXT: s_mov_b32 s10, -1
628+
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
629+
; GFX12-NEXT: s_mov_b32 s14, s10
630+
; GFX12-NEXT: s_mov_b32 s15, s11
631+
; GFX12-NEXT: s_mov_b32 s18, s10
632+
; GFX12-NEXT: s_mov_b32 s19, s11
633+
; GFX12-NEXT: s_mov_b32 s22, s10
634+
; GFX12-NEXT: s_mov_b32 s23, s11
635+
; GFX12-NEXT: s_wait_kmcnt 0x0
636+
; GFX12-NEXT: s_mov_b32 s12, s2
637+
; GFX12-NEXT: s_mov_b32 s13, s3
638+
; GFX12-NEXT: s_mov_b32 s16, s4
639+
; GFX12-NEXT: s_mov_b32 s17, s5
640+
; GFX12-NEXT: s_mov_b32 s20, s6
641+
; GFX12-NEXT: s_mov_b32 s21, s7
642+
; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
643+
; GFX12-NEXT: s_wait_loadcnt 0x0
644+
; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
645+
; GFX12-NEXT: s_wait_loadcnt 0x0
646+
; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
647+
; GFX12-NEXT: s_wait_loadcnt 0x0
648+
; GFX12-NEXT: s_mov_b32 s8, s0
649+
; GFX12-NEXT: s_mov_b32 s9, s1
650+
; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1
651+
; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
652+
; GFX12-NEXT: s_endpgm
532653
%a = load volatile half, ptr addrspace(1) %aptr, align 2
533654
%b = load volatile half, ptr addrspace(1) %bptr, align 2
534655
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
594715
; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
595716
; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
596717
; GFX11-NEXT: s_setpc_b64 s[30:31]
718+
;
719+
; GFX12-LABEL: no_fmax3_v2f16:
720+
; GFX12: ; %bb.0: ; %entry
721+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
722+
; GFX12-NEXT: s_wait_expcnt 0x0
723+
; GFX12-NEXT: s_wait_samplecnt 0x0
724+
; GFX12-NEXT: s_wait_bvhcnt 0x0
725+
; GFX12-NEXT: s_wait_kmcnt 0x0
726+
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
727+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
728+
; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
729+
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
730+
; GFX12-NEXT: s_setpc_b64 s[30:31]
597731
entry:
598732
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
599733
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)

0 commit comments

Comments
 (0)