Vectorized Multiply (on Avx512) #45

Merged
benaadams merged 38 commits into master from avx512-multiply on Feb 10, 2025

Conversation

@benaadams (Member) commented on Feb 9, 2025:

~30% faster than the current Scalar path (which already has vectorized adds) on an AMD Ryzen 9 7950X, and ~2.5x faster than vanilla MULX.

| Method           | Environment   | A                   | B                   | Mean     | Error    | StdDev   | Ratio |
|----------------- |-------------- |-------------------- |-------------------- |---------:|---------:|---------:|------:|
| Multiply_UInt256 | HWIntrinsic=0 | (619(...)658) [156] | (619(...)658) [156] | 21.44 ns | 0.359 ns | 0.369 ns |  1.79 |
| Multiply_UInt256 | Scalar        | (619(...)658) [156] | (619(...)658) [156] | 12.02 ns | 0.240 ns | 0.321 ns |  1.00 |
| Multiply_UInt256 | Avx512        | (619(...)658) [156] | (619(...)658) [156] |  8.56 ns | 0.126 ns | 0.118 ns |  0.71 |
| Multiply_UInt256 | HWIntrinsic=0 | (619(...)658) [156] | (115(...)935) [160] | 21.64 ns | 0.442 ns | 0.491 ns |  1.84 |
| Multiply_UInt256 | Scalar        | (619(...)658) [156] | (115(...)935) [160] | 11.74 ns | 0.243 ns | 0.227 ns |  1.00 |
| Multiply_UInt256 | Avx512        | (619(...)658) [156] | (115(...)935) [160] |  8.65 ns | 0.150 ns | 0.140 ns |  0.74 |
| Multiply_UInt256 | HWIntrinsic=0 | (115(...)935) [160] | (619(...)658) [156] | 21.81 ns | 0.453 ns | 0.589 ns |  1.83 |
| Multiply_UInt256 | Scalar        | (115(...)935) [160] | (619(...)658) [156] | 11.90 ns | 0.235 ns | 0.209 ns |  1.00 |
| Multiply_UInt256 | Avx512        | (115(...)935) [160] | (619(...)658) [156] |  8.48 ns | 0.139 ns | 0.124 ns |  0.71 |
| Multiply_UInt256 | HWIntrinsic=0 | (115(...)935) [160] | (115(...)935) [160] | 21.95 ns | 0.444 ns | 0.416 ns |  1.80 |
| Multiply_UInt256 | Scalar        | (115(...)935) [160] | (115(...)935) [160] | 12.21 ns | 0.231 ns | 0.216 ns |  1.00 |
| Multiply_UInt256 | Avx512        | (115(...)935) [160] | (115(...)935) [160] |  8.58 ns | 0.155 ns | 0.145 ns |  0.70 |
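
The Environment column identifies which implementation each row measures: HWIntrinsic=0 (hardware intrinsics disabled), the existing Scalar path (the Ratio baseline), and the new Avx512 path. A minimal sketch, assuming the .NET 8 intrinsics APIs, of how such a path is typically gated at runtime; `MultiplyAvx512` and `MultiplyScalar` are hypothetical names standing in for the two implementations, not the library's actual internals:

```csharp
using System.Runtime.Intrinsics.X86;

static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res)
{
    // vpmullq/vpmovm2q need AVX-512 DQ; vpternlogq/vpermq need AVX-512 F.
    if (Avx512F.IsSupported && Avx512DQ.IsSupported)
    {
        MultiplyAvx512(in x, in y, out res);   // the "Avx512" rows above
    }
    else
    {
        MultiplyScalar(in x, in y, out res);   // the "Scalar" fallback
    }
}
```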

Asm output

; Assembly listing for method Nethermind.Int256.UInt256:Multiply(byref,byref,byref) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 5 single block inlinees; 2 inlinees without PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 200
 
G_M000_IG02:                ;; offset=0x0007
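       ; annotation: OR together the upper limbs of both operands; if all are zero,
       ; both values fit in 64 bits and we jump to the mulx fast path in IG06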
       mov      rax, qword ptr [rcx+0x08]
       or       rax, qword ptr [rcx+0x10]
       or       rax, qword ptr [rcx+0x18]
       or       rax, qword ptr [rdx+0x08]
       or       rax, qword ptr [rdx+0x10]
       or       rax, qword ptr [rdx+0x18]
       je       G_M000_IG06
 
G_M000_IG03:                ;; offset=0x0025
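       ; annotation: full 256x256 path: permute the limbs of both operands across the
       ; zmm lanes (RWD64/RWD128), split each 64-bit limb into 32-bit halves (RWD00 mask)
       ; and form the partial products with vpmullq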
       vmovups  zmm0, zmmword ptr [rsp+0x50]
       vinserti64x4 zmm0, zmm0, ymmword ptr [rcx], 0
       vmovups  zmm1, zmmword ptr [rsp+0x10]
       vinserti64x4 zmm1, zmm1, ymmword ptr [rdx], 0
       vmovups  zmm2, zmmword ptr [reloc @RWD00]
       vmovups  zmm3, zmmword ptr [reloc @RWD64]
       vpermq   zmm1, zmm3, zmm1
       vmovups  zmm3, zmmword ptr [reloc @RWD128]
       vpermq   zmm0, zmm3, zmm0
       vpandq   zmm3, zmm2, zmm0
       vpandq   zmm4, zmm2, zmm1
       vpsrlq   zmm1, zmm1, 32
       vpmullq  zmm5, zmm3, zmm4
       vpmullq  zmm3, zmm3, zmm1
       vpsrlq   zmm0, zmm0, 32
       vpmullq  zmm4, zmm0, zmm4
       vpmullq  zmm0, zmm0, zmm1
       vpandq   zmm1, zmm2, zmm3
       vpsrlq   zmm16, zmm5, 32
       vpaddq   zmm1, zmm1, zmm16
       vpandq   zmm16, zmm2, zmm4
       vpaddq   zmm1, zmm16, zmm1
       vpandq   zmm16, zmm2, zmm1
       vpsllq   zmm16, zmm16, 32
       vpternlogq zmm5, zmm16, zmm2, -20
       vpsrlq   zmm2, zmm3, 32
       vpaddq   zmm0, zmm2, zmm0
       vpsrlq   zmm2, zmm4, 32
       vpaddq   zmm0, zmm2, zmm0
       vpsrlq   zmm1, zmm1, 32
       vpaddq   zmm0, zmm1, zmm0
       vpunpcklqdq zmm1, zmm5, zmm0
       vextracti32x4 xmm2, zmm1, 0
       vpunpckhqdq zmm0, zmm5, zmm0
       vextracti32x4 xmm3, zmm0, 0
       vextracti32x4 xmm4, zmm1, 1
       vextracti32x4 xmm16, zmm0, 1
       vextracti32x4 xmm1, zmm1, 2
       vextracti32x4 xmm0, zmm0, 2
       vmovd    xmm17, qword ptr [rcx+0x10]
       vpinsrq  xmm17, xmm17, qword ptr [rcx+0x18], 1
       vmovd    xmm18, qword ptr [rdx+0x08]
       vpinsrq  xmm18, xmm18, qword ptr [rdx], 1
       vpaddq   xmm4, xmm3, xmm4
       vpcmpuq  k1, xmm4, xmm3, 1
       vpmovm2q xmm19, k1
       vpsrlq   xmm19, xmm19, 63
       vxorps   xmm20, xmm20, xmm20
       vpunpcklqdq xmm19, xmm20, xmm19
       vpaddq   xmm4, xmm19, xmm4
       vpunpcklqdq xmm19, xmm20, xmm4
       vpaddq   xmm19, xmm19, xmm2
       vpunpckhqdq xmm20, xmm19, xmm19
       vpunpckhqdq xmm2, xmm2, xmm2
       vpcmpuq  k1, xmm20, xmm2, 1
       vpmovm2q xmm2, k1
       vpsrlq   xmm2, xmm2, 63
       vpunpckhqdq xmm4, xmm4, xmm4
       vpunpckhqdq xmm3, xmm3, xmm3
       vpcmpuq  k1, xmm3, xmm4, 6
       vpmovm2q xmm3, k1
       vpsrlq   xmm3, xmm3, 63
       vpaddq   xmm1, xmm16, xmm1
       vpcmpuq  k1, xmm1, xmm16, 1
       vpmovm2q xmm16, k1
       vpsrlq   xmm16, xmm16, 63
 
G_M000_IG04:                ;; offset=0x01D8
       vxorps   xmm20, xmm20, xmm20
       vpunpcklqdq xmm16, xmm20, xmm16
       vpaddq   xmm1, xmm16, xmm1
       vpaddq   xmm0, xmm1, xmm0
       vpcmpuq  k1, xmm0, xmm1, 1
       vpmovm2q xmm1, k1
       vpsrlq   xmm1, xmm1, 63
       vpunpcklqdq xmm1, xmm20, xmm1
       vpaddq   xmm0, xmm1, xmm0
       vpaddq   xmm1, xmm4, xmm2
       vpunpcklqdq xmm1, xmm1, xmm3
       vpaddq   xmm0, xmm1, xmm0
       vpcmpuq  k1, xmm0, xmm1, 1
       vpmovm2q xmm1, k1
       vpsrlq   xmm1, xmm1, 63
       vpunpcklqdq xmm1, xmm20, xmm1
       vpaddq   xmm0, xmm1, xmm0
       vpmullq  xmm1, xmm17, xmm18
       vextracti32x4 xmm2, zmm5, 3
       vpaddq   xmm1, xmm2, xmm1
       vpunpcklqdq xmm2, xmm1, xmm1
       vpaddq   xmm1, xmm2, xmm1
       vpunpckhqdq xmm1, xmm20, xmm1
       vpaddq   xmm0, xmm1, xmm0
       vinserti32x4 ymm0, ymm19, xmm0, 1
       vmovups  ymmword ptr [r8], ymm0
 
G_M000_IG05:                ;; offset=0x0261
       vzeroupper 
       add      rsp, 200
       ret      
 
G_M000_IG06:                ;; offset=0x026C
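       ; annotation: fast path for operands that fit in 64 bits: a single 64x64->128-bit
       ; mulx, with the upper 128 bits of the result zeroed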
       mov      bword ptr [rsp+0xD8], rdx
       mov      rdx, qword ptr [rcx]
       mov      rax, bword ptr [rsp+0xD8]
       mov      rax, qword ptr [rax]
       lea      rcx, [rsp+0x08]
       mulx     rax, r10, rax
       mov      qword ptr [rcx], r10
       mov      rcx, qword ptr [rsp+0x08]
       vxorps   ymm0, ymm0, ymm0
       vmovdqu  ymmword ptr [r8], ymm0
       mov      qword ptr [r8], rcx
       mov      qword ptr [r8+0x08], rax
 
G_M000_IG07:                ;; offset=0x02A4
       vzeroupper 
       add      rsp, 200
       ret      
 
RWD00  	dq	00000000FFFFFFFFh, 00000000FFFFFFFFh, 00000000FFFFFFFFh, 00000000FFFFFFFFh, 00000000FFFFFFFFh, 00000000FFFFFFFFh, 00000000FFFFFFFFh, 00000000FFFFFFFFh
RWD64  	dq	0000000000000000h, 0000000000000001h, 0000000000000000h, 0000000000000002h, 0000000000000001h, 0000000000000000h, 0000000000000003h, 0000000000000002h
RWD128 	dq	0000000000000000h, 0000000000000000h, 0000000000000001h, 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000000h, 0000000000000001h

; Total bytes of code 687
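
Since AVX-512 has no widening 64x64->128-bit multiply, the vectorized path splits each 64-bit limb into 32-bit halves (the RWD00 mask), forms four partial products per limb pair with vpmullq, and recombines them with shifts and adds; the cross-limb carries are then resolved with the vpcmpuq/vpmovm2q compare-and-add sequences. A minimal scalar C# sketch of that decomposition for a single limb pair (a reader annotation, not code from this PR; the register notes map only approximately onto the listing above):

```csharp
// Rebuild the full 128-bit product of two 64-bit limbs from 32-bit halves,
// the trick the zmm code above applies to eight limb pairs at once.
static (ulong lo, ulong hi) Mul64Full(ulong x, ulong y)
{
    const ulong Mask32 = 0xFFFFFFFFul;   // the RWD00 constant
    ulong xLo = x & Mask32, xHi = x >> 32;
    ulong yLo = y & Mask32, yHi = y >> 32;

    ulong ll = xLo * yLo;                // ~ vpmullq result in zmm5
    ulong lh = xLo * yHi;                // ~ vpmullq result in zmm3
    ulong hl = xHi * yLo;                // ~ vpmullq result in zmm4
    ulong hh = xHi * yHi;                // ~ vpmullq result in zmm0

    // Propagate carries between the 32-bit-aligned partial products.
    ulong mid = (ll >> 32) + (lh & Mask32) + (hl & Mask32);
    ulong lo  = (ll & Mask32) | (mid << 32);
    ulong hi  = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
    return (lo, hi);
}
```

The low half agrees with x * y and the high half with Math.BigMul(x, y, out _), which is effectively what the mulx fast path in IG06 computes directly when both operands fit in a single limb.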

@LukaszRozmej (Member) left a comment:

HOW!?

@benaadams (Member, Author) replied:

> HOW!?

Perseverance 😅

@benaadams merged commit 7d7b936 into master on Feb 10, 2025 (4 checks passed).
@benaadams deleted the avx512-multiply branch on February 10, 2025 at 10:21.