|
3 | 3 | ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
|
4 | 4 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
|
5 | 5 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
|
| 6 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s |
6 | 7 |
|
7 | 8 | define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
|
8 | 9 | ; SI-LABEL: test_fmax3_olt_0_f32:
|
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
|
124 | 125 | ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
|
125 | 126 | ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
|
126 | 127 | ; GFX11-NEXT: s_endpgm
|
| 128 | +; |
| 129 | +; GFX12-LABEL: test_fmax3_olt_0_f32: |
| 130 | +; GFX12: ; %bb.0: |
| 131 | +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 132 | +; GFX12-NEXT: s_mov_b32 s10, -1 |
| 133 | +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 |
| 134 | +; GFX12-NEXT: s_mov_b32 s14, s10 |
| 135 | +; GFX12-NEXT: s_mov_b32 s15, s11 |
| 136 | +; GFX12-NEXT: s_mov_b32 s18, s10 |
| 137 | +; GFX12-NEXT: s_mov_b32 s19, s11 |
| 138 | +; GFX12-NEXT: s_mov_b32 s22, s10 |
| 139 | +; GFX12-NEXT: s_mov_b32 s23, s11 |
| 140 | +; GFX12-NEXT: s_wait_kmcnt 0x0 |
| 141 | +; GFX12-NEXT: s_mov_b32 s12, s2 |
| 142 | +; GFX12-NEXT: s_mov_b32 s13, s3 |
| 143 | +; GFX12-NEXT: s_mov_b32 s16, s4 |
| 144 | +; GFX12-NEXT: s_mov_b32 s17, s5 |
| 145 | +; GFX12-NEXT: s_mov_b32 s20, s6 |
| 146 | +; GFX12-NEXT: s_mov_b32 s21, s7 |
| 147 | +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS |
| 148 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 149 | +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS |
| 150 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 151 | +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS |
| 152 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 153 | +; GFX12-NEXT: s_mov_b32 s8, s0 |
| 154 | +; GFX12-NEXT: s_mov_b32 s9, s1 |
| 155 | +; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| 156 | +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null |
| 157 | +; GFX12-NEXT: s_endpgm |
127 | 158 | %a = load volatile float, ptr addrspace(1) %aptr, align 4
|
128 | 159 | %b = load volatile float, ptr addrspace(1) %bptr, align 4
|
129 | 160 | %c = load volatile float, ptr addrspace(1) %cptr, align 4
|
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
|
254 | 285 | ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
|
255 | 286 | ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
|
256 | 287 | ; GFX11-NEXT: s_endpgm
|
| 288 | +; |
| 289 | +; GFX12-LABEL: test_fmax3_olt_1_f32: |
| 290 | +; GFX12: ; %bb.0: |
| 291 | +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 292 | +; GFX12-NEXT: s_mov_b32 s10, -1 |
| 293 | +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 |
| 294 | +; GFX12-NEXT: s_mov_b32 s14, s10 |
| 295 | +; GFX12-NEXT: s_mov_b32 s15, s11 |
| 296 | +; GFX12-NEXT: s_mov_b32 s18, s10 |
| 297 | +; GFX12-NEXT: s_mov_b32 s19, s11 |
| 298 | +; GFX12-NEXT: s_mov_b32 s22, s10 |
| 299 | +; GFX12-NEXT: s_mov_b32 s23, s11 |
| 300 | +; GFX12-NEXT: s_wait_kmcnt 0x0 |
| 301 | +; GFX12-NEXT: s_mov_b32 s12, s2 |
| 302 | +; GFX12-NEXT: s_mov_b32 s13, s3 |
| 303 | +; GFX12-NEXT: s_mov_b32 s16, s4 |
| 304 | +; GFX12-NEXT: s_mov_b32 s17, s5 |
| 305 | +; GFX12-NEXT: s_mov_b32 s20, s6 |
| 306 | +; GFX12-NEXT: s_mov_b32 s21, s7 |
| 307 | +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS |
| 308 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 309 | +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS |
| 310 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 311 | +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS |
| 312 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 313 | +; GFX12-NEXT: s_mov_b32 s8, s0 |
| 314 | +; GFX12-NEXT: s_mov_b32 s9, s1 |
| 315 | +; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1 |
| 316 | +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null |
| 317 | +; GFX12-NEXT: s_endpgm |
257 | 318 | %a = load volatile float, ptr addrspace(1) %aptr, align 4
|
258 | 319 | %b = load volatile float, ptr addrspace(1) %bptr, align 4
|
259 | 320 | %c = load volatile float, ptr addrspace(1) %cptr, align 4
|
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
|
391 | 452 | ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
|
392 | 453 | ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
|
393 | 454 | ; GFX11-NEXT: s_endpgm
|
| 455 | +; |
| 456 | +; GFX12-LABEL: test_fmax3_olt_0_f16: |
| 457 | +; GFX12: ; %bb.0: |
| 458 | +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 459 | +; GFX12-NEXT: s_mov_b32 s10, -1 |
| 460 | +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 |
| 461 | +; GFX12-NEXT: s_mov_b32 s14, s10 |
| 462 | +; GFX12-NEXT: s_mov_b32 s15, s11 |
| 463 | +; GFX12-NEXT: s_mov_b32 s18, s10 |
| 464 | +; GFX12-NEXT: s_mov_b32 s19, s11 |
| 465 | +; GFX12-NEXT: s_mov_b32 s22, s10 |
| 466 | +; GFX12-NEXT: s_mov_b32 s23, s11 |
| 467 | +; GFX12-NEXT: s_wait_kmcnt 0x0 |
| 468 | +; GFX12-NEXT: s_mov_b32 s12, s2 |
| 469 | +; GFX12-NEXT: s_mov_b32 s13, s3 |
| 470 | +; GFX12-NEXT: s_mov_b32 s16, s4 |
| 471 | +; GFX12-NEXT: s_mov_b32 s17, s5 |
| 472 | +; GFX12-NEXT: s_mov_b32 s20, s6 |
| 473 | +; GFX12-NEXT: s_mov_b32 s21, s7 |
| 474 | +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS |
| 475 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 476 | +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS |
| 477 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 478 | +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS |
| 479 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 480 | +; GFX12-NEXT: s_mov_b32 s8, s0 |
| 481 | +; GFX12-NEXT: s_mov_b32 s9, s1 |
| 482 | +; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2 |
| 483 | +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null |
| 484 | +; GFX12-NEXT: s_endpgm |
394 | 485 | %a = load volatile half, ptr addrspace(1) %aptr, align 2
|
395 | 486 | %b = load volatile half, ptr addrspace(1) %bptr, align 2
|
396 | 487 | %c = load volatile half, ptr addrspace(1) %cptr, align 2
|
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
|
529 | 620 | ; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
|
530 | 621 | ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
|
531 | 622 | ; GFX11-NEXT: s_endpgm
|
| 623 | +; |
| 624 | +; GFX12-LABEL: test_fmax3_olt_1_f16: |
| 625 | +; GFX12: ; %bb.0: |
| 626 | +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 627 | +; GFX12-NEXT: s_mov_b32 s10, -1 |
| 628 | +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 |
| 629 | +; GFX12-NEXT: s_mov_b32 s14, s10 |
| 630 | +; GFX12-NEXT: s_mov_b32 s15, s11 |
| 631 | +; GFX12-NEXT: s_mov_b32 s18, s10 |
| 632 | +; GFX12-NEXT: s_mov_b32 s19, s11 |
| 633 | +; GFX12-NEXT: s_mov_b32 s22, s10 |
| 634 | +; GFX12-NEXT: s_mov_b32 s23, s11 |
| 635 | +; GFX12-NEXT: s_wait_kmcnt 0x0 |
| 636 | +; GFX12-NEXT: s_mov_b32 s12, s2 |
| 637 | +; GFX12-NEXT: s_mov_b32 s13, s3 |
| 638 | +; GFX12-NEXT: s_mov_b32 s16, s4 |
| 639 | +; GFX12-NEXT: s_mov_b32 s17, s5 |
| 640 | +; GFX12-NEXT: s_mov_b32 s20, s6 |
| 641 | +; GFX12-NEXT: s_mov_b32 s21, s7 |
| 642 | +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS |
| 643 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 644 | +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS |
| 645 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 646 | +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS |
| 647 | +; GFX12-NEXT: s_wait_loadcnt 0x0 |
| 648 | +; GFX12-NEXT: s_mov_b32 s8, s0 |
| 649 | +; GFX12-NEXT: s_mov_b32 s9, s1 |
| 650 | +; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1 |
| 651 | +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null |
| 652 | +; GFX12-NEXT: s_endpgm |
532 | 653 | %a = load volatile half, ptr addrspace(1) %aptr, align 2
|
533 | 654 | %b = load volatile half, ptr addrspace(1) %bptr, align 2
|
534 | 655 | %c = load volatile half, ptr addrspace(1) %cptr, align 2
|
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
|
594 | 715 | ; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
|
595 | 716 | ; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
|
596 | 717 | ; GFX11-NEXT: s_setpc_b64 s[30:31]
|
| 718 | +; |
| 719 | +; GFX12-LABEL: no_fmax3_v2f16: |
| 720 | +; GFX12: ; %bb.0: ; %entry |
| 721 | +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| 722 | +; GFX12-NEXT: s_wait_expcnt 0x0 |
| 723 | +; GFX12-NEXT: s_wait_samplecnt 0x0 |
| 724 | +; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| 725 | +; GFX12-NEXT: s_wait_kmcnt 0x0 |
| 726 | +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 |
| 727 | +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| 728 | +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0 |
| 729 | +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3 |
| 730 | +; GFX12-NEXT: s_setpc_b64 s[30:31] |
597 | 731 | entry:
|
598 | 732 | %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
|
599 | 733 | %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
|
|
0 commit comments