perf: optimize MlpgAdjust #116
Draft
cm-ayf wants to merge 20 commits into
Draft
Conversation
Codecov Report❌ Patch coverage is
Additional details and impacted files
@@ Coverage Diff @@
## main #116 +/- ##
==========================================
- Coverage 71.46% 71.28% -0.18%
==========================================
Files 37 37
Lines 1675 1689 +14
==========================================
+ Hits 1197 1204 +7
- Misses 478 485 +7
☔ View full report in Codecov by Harness.
🚀 New features to boost your workflow:
|
mlsafir diagnostics
ref:
Artifact: aarch64-base
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
// jbonsai::vocoder::mlsa::fir — compiler-generated AArch64 listing (aarch64-base artifact).
// Register use visible in this listing:
//   x0 = base of a buffer of doubles that is rewritten in place
//   x1 = element count, checked against 1 and against x3
//   x2 = base of a second buffer of doubles that is only read
//   x3 = bound compared with x1 (x1 > x3 takes the slice_index_fail path)
//   d0 = scalar input, d1 = scalar coefficient
//   the result (a sum of products) is returned in d0
// NOTE(review): the Rust-level parameter names and types are not visible here — confirm against the source.
.p2align 2
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
// Prologue: save x29/x30 and establish the frame pointer.
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
// A count of 0 or 1 goes to the panic path.
cmp x1, #1
b.ls .LBB261_12
// First two elements: d4 = d1*d1, d2 = 1.0 - d1*d1, d3 = old [x0+8].
// New [x0] = d0*d1 and new [x0+8] = d0*d2 + d1*d3; both are stored before the
// x1 > x3 branch below is taken.
fmul d4, d1, d1
fmov d2, #1.00000000
ldr d3, [x0, #8]
cmp x1, x3
fmul d6, d1, d3
fsub d2, d2, d4
fmul d5, d0, d2
fmul d0, d0, d1
fadd d5, d5, d6
stp d0, d5, [x0]
b.hi .LBB261_13
// d3 = old [x0+8] - new [x0]: the value carried into the following elements.
// x9 = x1 - 2 remaining elements, x10 = number of 4-element groups, x8 = leftover (0..3).
fsub d3, d3, d0
sub x9, x1, #2
lsr x10, x9, #2
and x8, x9, #0x3
cbz x10, .LBB261_10
// Vector-loop setup: d5 = d1^4, d6 = -d1, v0 = two-lane accumulator cleared to zero,
// x11/x12 point 32 bytes past the x0/x2 bases.
fmul d5, d4, d4
fneg d6, d1
movi v0.2d, #0000000000000000
add x11, x0, #32
add x12, x2, #32
// Main loop: each iteration reads the four doubles at [x11-16, x11+16), rewrites them
// in place, multiplies the new values by the four doubles at [x12-16, x12+16) and adds
// the products into v0. d3 is recomputed at the end as the carry for the next iteration.
.LBB261_4:
ldur q7, [x11, #-16]
ldur q17, [x11, #-8]
mov v18.16b, v3.16b
subs x10, x10, #1
fmsub d16, d1, d3, d7
fmla v17.2d, v7.2d, v6.d[0]
fmul v7.2d, v7.2d, v1.d[0]
// v18 = { carry, first old element - d1*carry }
mov v18.d[1], v16.d[0]
mov v19.16b, v17.16b
ldr q16, [x11]
fmul v20.2d, v16.2d, v1.d[0]
mov d16, v16.d[1]
fmla v19.2d, v18.2d, v4.d[0]
fmla v7.2d, v18.2d, v2.d[0]
ldr d18, [x11]
fmsub d16, d1, d18, d16
fmla v20.2d, v19.2d, v2.d[0]
ldp q19, q21, [x12, #-16]
add x12, x12, #32
fmadd d16, d4, d17, d16
fmul v18.2d, v7.2d, v19.2d
fmul v19.2d, v20.2d, v21.2d
stp q7, q20, [x11, #-16]
add x11, x11, #32
fmadd d3, d5, d3, d16
fadd v17.2d, v18.2d, v19.2d
fadd v0.2d, v0.2d, v17.2d
b.ne .LBB261_4
// No leftover elements: skip the scalar tail.
cbz x8, .LBB261_11
// Scalar tail for the 1..3 leftover elements. (x9 << 3) masked with 0x7fffffffffffffe0
// is the byte offset (a multiple of 32) already covered by the vector loop; x9/x10
// become the x0/x2 bases at that offset and the leftovers sit at +16, +24 and +32.
.LBB261_6:
lsl x9, x9, #3
fmul d4, d2, d3
cmp x8, #1
and x10, x9, #0x7fffffffffffffe0
add x9, x0, x10
add x10, x2, x10
// First leftover: new = d2*d3 + d1*old; running sum d4 = d0 (lane 0 of v0) + new*[x10+16].
ldur d5, [x9, #16]
fmul d6, d1, d5
fadd d6, d4, d6
ldur d4, [x10, #16]
fmul d4, d4, d6
stur d6, [x9, #16]
fadd d4, d0, d4
b.eq .LBB261_9
// Second leftover: carry d3 = previous old element - d1*d3, then the same update.
fmul d3, d1, d3
cmp x8, #2
fsub d3, d5, d3
ldur d5, [x9, #24]
fmul d7, d1, d5
fmul d6, d2, d3
fadd d6, d6, d7
ldur d7, [x10, #24]
fmul d7, d7, d6
stur d6, [x9, #24]
fadd d4, d4, d7
b.eq .LBB261_9
// Third leftover: same pattern; d1 and d2 are overwritten because nothing follows.
fmul d3, d1, d3
fsub d3, d5, d3
ldur d5, [x9, #32]
fmul d1, d1, d5
fmul d2, d2, d3
fadd d1, d2, d1
ldur d2, [x10, #32]
fmul d2, d2, d1
stur d1, [x9, #32]
fadd d4, d4, d2
// Add lane 1 of the vector accumulator to the scalar running sum; return it in d0.
.LBB261_9:
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
// Fewer than four remaining elements: clear the accumulator, then run the tail if x8 != 0.
.LBB261_10:
.cfi_restore_state
.cfi_remember_state
movi v0.2d, #0000000000000000
cbnz x8, .LBB261_6
// No tail: the result is lane 0 + lane 1 of v0.
.LBB261_11:
fmov d4, d0
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
// x1 <= 1: call core::panicking::panic with x0 and x2 holding addresses of anonymous
// constants and w1 = 30.
// NOTE(review): presumably message pointer, message length and source location — confirm.
.LBB261_12:
.cfi_restore_state
adrp x0, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.530
add x0, x0, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.530
adrp x2, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.531
add x2, x2, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.531
mov w1, #30
bl core::panicking::panic
// x1 > x3: set up the call to core::slice::index::slice_index_fail with x0 = 2,
// x1 unchanged, x2 = the old x3 and x3 = address of an anonymous constant.
.LBB261_13:
adrp x8, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.532
add x8, x8, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.532
mov w0, #2
mov x2, x3
mov x3, x8
bl core::slice::index::slice_index_fail
Benchmark results:
Artifact: aarch64-merge
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
// jbonsai::vocoder::mlsa::fir — compiler-generated AArch64 listing (aarch64-merge artifact).
// Register use visible in this listing:
//   x0 = base of a buffer of doubles that is rewritten in place
//   x1 = element count, checked against 1 and against x3
//   x2 = base of a second buffer of doubles that is only read
//   x3 = bound compared with x1 (x1 > x3 takes the slice_index_fail path)
//   d0 = scalar input, d1 = scalar coefficient
//   the result (a sum of products) is returned in d0
// NOTE(review): the Rust-level parameter names and types are not visible here — confirm against the source.
.p2align 2
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
// Prologue: save x29/x30 and establish the frame pointer.
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
// A count of 0 or 1 goes to the panic path.
cmp x1, #1
b.ls .LBB262_12
// First two elements: d4 = d1*d1, d2 = 1.0 - d1*d1, d3 = old [x0+8].
// New [x0] = d0*d1 and new [x0+8] = d0*d2 + d1*d3; both are stored before the
// x1 > x3 branch below is taken.
fmul d4, d1, d1
fmov d2, #1.00000000
ldr d3, [x0, #8]
cmp x1, x3
fmul d6, d1, d3
fsub d2, d2, d4
fmul d5, d0, d2
fmul d0, d0, d1
fadd d5, d5, d6
stp d0, d5, [x0]
b.hi .LBB262_13
// d3 = old [x0+8] - new [x0]: the value carried into the following elements.
// x9 = x1 - 2 remaining elements, x10 = number of 4-element groups, x8 = leftover (0..3).
fsub d3, d3, d0
sub x9, x1, #2
lsr x10, x9, #2
and x8, x9, #0x3
cbz x10, .LBB262_10
// Vector-loop setup: d5 = d1^4, d6 = -d1, v0 = two-lane accumulator cleared to zero,
// x11/x12 point 32 bytes past the x0/x2 bases.
fmul d5, d4, d4
fneg d6, d1
movi v0.2d, #0000000000000000
add x11, x0, #32
add x12, x2, #32
// Main loop: each iteration reads the four doubles at [x11-16, x11+16), rewrites them
// in place, multiplies the new values by the four doubles at [x12-16, x12+16) and adds
// the products into v0. d3 is recomputed at the end as the carry for the next iteration.
.LBB262_4:
ldur q7, [x11, #-16]
ldur q17, [x11, #-8]
mov v18.16b, v3.16b
subs x10, x10, #1
fmsub d16, d1, d3, d7
fmla v17.2d, v7.2d, v6.d[0]
fmul v7.2d, v7.2d, v1.d[0]
// v18 = { carry, first old element - d1*carry }
mov v18.d[1], v16.d[0]
mov v19.16b, v17.16b
ldr q16, [x11]
fmul v20.2d, v16.2d, v1.d[0]
mov d16, v16.d[1]
fmla v19.2d, v18.2d, v4.d[0]
fmla v7.2d, v18.2d, v2.d[0]
ldr d18, [x11]
fmsub d16, d1, d18, d16
fmla v20.2d, v19.2d, v2.d[0]
ldp q19, q21, [x12, #-16]
add x12, x12, #32
fmadd d16, d4, d17, d16
fmul v18.2d, v7.2d, v19.2d
fmul v19.2d, v20.2d, v21.2d
stp q7, q20, [x11, #-16]
add x11, x11, #32
fmadd d3, d5, d3, d16
fadd v17.2d, v18.2d, v19.2d
fadd v0.2d, v0.2d, v17.2d
b.ne .LBB262_4
// No leftover elements: skip the scalar tail.
cbz x8, .LBB262_11
// Scalar tail for the 1..3 leftover elements. (x9 << 3) masked with 0x7fffffffffffffe0
// is the byte offset (a multiple of 32) already covered by the vector loop; x9/x10
// become the x0/x2 bases at that offset and the leftovers sit at +16, +24 and +32.
.LBB262_6:
lsl x9, x9, #3
fmul d4, d2, d3
cmp x8, #1
and x10, x9, #0x7fffffffffffffe0
add x9, x0, x10
add x10, x2, x10
// First leftover: new = d2*d3 + d1*old; running sum d4 = d0 (lane 0 of v0) + new*[x10+16].
ldur d5, [x9, #16]
fmul d6, d1, d5
fadd d6, d4, d6
ldur d4, [x10, #16]
fmul d4, d4, d6
stur d6, [x9, #16]
fadd d4, d0, d4
b.eq .LBB262_9
// Second leftover: carry d3 = previous old element - d1*d3, then the same update.
fmul d3, d1, d3
cmp x8, #2
fsub d3, d5, d3
ldur d5, [x9, #24]
fmul d7, d1, d5
fmul d6, d2, d3
fadd d6, d6, d7
ldur d7, [x10, #24]
fmul d7, d7, d6
stur d6, [x9, #24]
fadd d4, d4, d7
b.eq .LBB262_9
// Third leftover: same pattern; d1 and d2 are overwritten because nothing follows.
fmul d3, d1, d3
fsub d3, d5, d3
ldur d5, [x9, #32]
fmul d1, d1, d5
fmul d2, d2, d3
fadd d1, d2, d1
ldur d2, [x10, #32]
fmul d2, d2, d1
stur d1, [x9, #32]
fadd d4, d4, d2
// Add lane 1 of the vector accumulator to the scalar running sum; return it in d0.
.LBB262_9:
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
// Fewer than four remaining elements: clear the accumulator, then run the tail if x8 != 0.
.LBB262_10:
.cfi_restore_state
.cfi_remember_state
movi v0.2d, #0000000000000000
cbnz x8, .LBB262_6
// No tail: the result is lane 0 + lane 1 of v0.
.LBB262_11:
fmov d4, d0
mov d0, v0.d[1]
fadd d0, d4, d0
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
// x1 <= 1: call core::panicking::panic with x0 and x2 holding addresses of anonymous
// constants and w1 = 30.
// NOTE(review): presumably message pointer, message length and source location — confirm.
.LBB262_12:
.cfi_restore_state
adrp x0, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.515
add x0, x0, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.515
adrp x2, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.516
add x2, x2, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.516
mov w1, #30
bl core::panicking::panic
// x1 > x3: set up the call to core::slice::index::slice_index_fail with x0 = 2,
// x1 unchanged, x2 = the old x3 and x3 = address of an anonymous constant.
.LBB262_13:
adrp x8, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.517
add x8, x8, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.517
mov w0, #2
mov x2, x3
mov x3, x8
bl core::slice::index::slice_index_fail
Benchmark results:
Artifact: x86_64+avx2+fma-base
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir — compiler-generated x86-64 listing, Intel syntax
# (x86_64+avx2+fma-base artifact).
# Register use visible in this listing:
#   rdi  = base of a buffer of doubles that is rewritten in place
#   rsi  = element count, checked against 1 and against rcx
#   rdx  = base of a second buffer of doubles that is only read
#   rcx  = bound compared with rsi (rsi > rcx takes the slice_index_fail path)
#   xmm0 = scalar input, xmm1 = scalar coefficient
#   the result (a sum of products) is returned in xmm0
# NOTE(review): the Rust-level parameter names and types are not visible here — confirm against the source.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# push rax moves rsp by 8 (CFA offset becomes 16); the pushed value is never used.
# NOTE(review): presumably keeps rsp 16-byte aligned for the calls on the failure paths.
push rax
.cfi_def_cfa_offset 16
# A count of 0 or 1 goes to the panic path.
cmp rsi, 1
jbe .LBB262_12
# First two elements: xmm3 = xmm1*xmm1, xmm2 = [.LCPI262_0] - xmm3, xmm4 = old [rdi+8].
# New [rdi] = xmm0*xmm1 and new [rdi+8] = xmm0*xmm2 + xmm1*xmm4; both are stored
# before the rsi > rcx check.
# NOTE(review): .LCPI262_0 is not shown in this listing; presumably the constant 1.0.
vmulsd xmm3, xmm1, xmm1
vmovsd xmm2, qword ptr [rip + .LCPI262_0]
vsubsd xmm2, xmm2, xmm3
vmovsd xmm4, qword ptr [rdi + 8]
vmulsd xmm5, xmm0, xmm1
vmovsd qword ptr [rdi], xmm5
vmulsd xmm0, xmm0, xmm2
vmulsd xmm6, xmm1, xmm4
vaddsd xmm0, xmm0, xmm6
vmovsd qword ptr [rdi + 8], xmm0
cmp rsi, rcx
ja .LBB262_13
# xmm4 = old [rdi+8] - new [rdi]: the value carried into the following elements.
# rsi = count - 2 remaining elements, eax = leftover (0..3), rcx = number of 4-element groups.
vsubsd xmm4, xmm4, xmm5
add rsi, -2
mov eax, esi
and eax, 3
mov rcx, rsi
shr rcx, 2
je .LBB262_3
# Vector-loop setup: xmm5 = xmm1 XOR [.LCPI262_1], xmm6 = xmm1^4, xmm7..xmm10 = xmm1,
# xmm2, xmm5, xmm3 each duplicated into both lanes, xmm0 = two-lane accumulator cleared
# to zero, r8 = byte offset 32, xmm11 = carry.
# NOTE(review): .LCPI262_1 is not shown; presumably a sign-bit mask so that xmm5 = -xmm1.
vxorpd xmm5, xmm1, xmmword ptr [rip + .LCPI262_1]
vmulsd xmm6, xmm3, xmm3
vmovddup xmm7, xmm1
vmovddup xmm8, xmm2
vmovddup xmm9, xmm5
vmovddup xmm10, xmm3
vxorpd xmm0, xmm0, xmm0
mov r8d, 32
vmovapd xmm11, xmm4
.p2align 4
# Main loop: each iteration reads the four doubles at [rdi+r8-16, rdi+r8+16), rewrites
# them in place, multiplies the new values by the four doubles at the same offsets from
# rdx and adds the products into xmm0. xmm4 is the carry for the next iteration and is
# copied back into xmm11 at the bottom of the loop.
.LBB262_7:
vmovupd xmm12, xmmword ptr [rdi + r8 - 16]
vmovapd xmm4, xmm11
vfmadd213sd xmm4, xmm5, xmm12
vmulpd xmm13, xmm12, xmm7
vunpcklpd xmm14, xmm11, xmm4
vfmadd231pd xmm13, xmm8, xmm14
vfmadd213pd xmm12, xmm9, xmmword ptr [rdi + r8 - 8]
vmovsd xmm4, qword ptr [rdi + r8]
vfmadd213pd xmm14, xmm10, xmm12
vmulpd xmm15, xmm7, xmmword ptr [rdi + r8]
vfmadd231pd xmm15, xmm8, xmm14
vfmadd213sd xmm4, xmm5, qword ptr [rdi + r8 + 8]
vfmadd231sd xmm4, xmm3, xmm12
vfmadd231sd xmm4, xmm6, xmm11
vmovupd xmmword ptr [rdi + r8 - 16], xmm13
vmulpd xmm11, xmm13, xmmword ptr [rdx + r8 - 16]
vmovupd xmmword ptr [rdi + r8], xmm15
vmulpd xmm12, xmm15, xmmword ptr [rdx + r8]
vaddpd xmm11, xmm11, xmm12
vaddpd xmm0, xmm11, xmm0
add r8, 32
vmovapd xmm11, xmm4
dec rcx
jne .LBB262_7
# No leftover elements: skip the scalar tail.
test rax, rax
je .LBB262_5
# Scalar tail for the 1..3 leftover elements. rsi is masked with 0x0FFFFFFFFFFFFFFC,
# i.e. rounded down to a multiple of 4 elements; the leftovers sit at
# [base + 8*rsi + 16], +24 and +32.
.LBB262_8:
movabs rcx, 1152921504606846972
and rsi, rcx
# First leftover: new = xmm2*xmm4 + xmm1*old; running sum xmm3 = xmm0 (low lane) + new*[rdx+...].
vmulsd xmm3, xmm2, xmm4
vmovsd xmm5, qword ptr [rdi + 8*rsi + 16]
vmulsd xmm6, xmm1, xmm5
vaddsd xmm3, xmm3, xmm6
vmovsd qword ptr [rdi + 8*rsi + 16], xmm3
vmulsd xmm3, xmm3, qword ptr [rdx + 8*rsi + 16]
vaddsd xmm3, xmm0, xmm3
cmp eax, 1
je .LBB262_11
# Second leftover: carry xmm4 = previous old element - xmm1*xmm4, then the same update.
vmulsd xmm4, xmm1, xmm4
vsubsd xmm4, xmm5, xmm4
vmulsd xmm6, xmm2, xmm4
vmovsd xmm5, qword ptr [rdi + 8*rsi + 24]
vmulsd xmm7, xmm1, xmm5
vaddsd xmm6, xmm6, xmm7
vmovsd qword ptr [rdi + 8*rsi + 24], xmm6
vmulsd xmm6, xmm6, qword ptr [rdx + 8*rsi + 24]
vaddsd xmm3, xmm3, xmm6
cmp eax, 2
je .LBB262_11
# Third leftover: same pattern; xmm1 and xmm2 are overwritten because nothing follows.
vmulsd xmm4, xmm1, xmm4
vsubsd xmm4, xmm5, xmm4
vmulsd xmm2, xmm2, xmm4
vmulsd xmm1, xmm1, qword ptr [rdi + 8*rsi + 32]
vaddsd xmm1, xmm2, xmm1
vmovsd qword ptr [rdi + 8*rsi + 32], xmm1
vmulsd xmm1, xmm1, qword ptr [rdx + 8*rsi + 32]
vaddsd xmm3, xmm3, xmm1
# Swap the accumulator lanes and add the former high lane to the scalar running sum;
# the result is returned in xmm0.
.LBB262_11:
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm3, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
# Fewer than four remaining elements: clear the accumulator, then run the tail if rax != 0.
.LBB262_3:
.cfi_def_cfa_offset 16
vxorpd xmm0, xmm0, xmm0
test rax, rax
jne .LBB262_8
# No tail: the result is low lane + high lane of xmm0.
.LBB262_5:
vmovapd xmm3, xmm0
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm3, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
# rsi <= 1: call core::panicking::panic through the GOT with rdi and rdx holding
# addresses of anonymous constants and esi = 30.
# NOTE(review): presumably message pointer, message length and source location — confirm.
.LBB262_12:
.cfi_def_cfa_offset 16
lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.532]
lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.533]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
# rsi > rcx: set up the call to core::slice::index::slice_index_fail with edi = 2,
# rsi unchanged, rdx = the old rcx and rcx = address of an anonymous constant.
.LBB262_13:
lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.534]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results:
Artifact: x86_64+avx2+fma-merge
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir — compiler-generated x86-64 listing, Intel syntax
# (x86_64+avx2+fma-merge artifact).
# Register use visible in this listing:
#   rdi  = base of a buffer of doubles that is rewritten in place
#   rsi  = element count, checked against 1 and against rcx
#   rdx  = base of a second buffer of doubles that is only read
#   rcx  = bound compared with rsi (rsi > rcx takes the slice_index_fail path)
#   xmm0 = scalar input, xmm1 = scalar coefficient
#   the result (a sum of products) is returned in xmm0
# NOTE(review): the Rust-level parameter names and types are not visible here — confirm against the source.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# push rax moves rsp by 8 (CFA offset becomes 16); the pushed value is never used.
# NOTE(review): presumably keeps rsp 16-byte aligned for the calls on the failure paths.
push rax
.cfi_def_cfa_offset 16
# A count of 0 or 1 goes to the panic path.
cmp rsi, 1
jbe .LBB263_12
# First two elements: xmm3 = xmm1*xmm1, xmm2 = [.LCPI263_0] - xmm3, xmm4 = old [rdi+8].
# New [rdi] = xmm0*xmm1 and new [rdi+8] = xmm0*xmm2 + xmm1*xmm4; both are stored
# before the rsi > rcx check.
# NOTE(review): .LCPI263_0 is not shown in this listing; presumably the constant 1.0.
vmulsd xmm3, xmm1, xmm1
vmovsd xmm2, qword ptr [rip + .LCPI263_0]
vsubsd xmm2, xmm2, xmm3
vmovsd xmm4, qword ptr [rdi + 8]
vmulsd xmm5, xmm0, xmm1
vmovsd qword ptr [rdi], xmm5
vmulsd xmm0, xmm0, xmm2
vmulsd xmm6, xmm1, xmm4
vaddsd xmm0, xmm0, xmm6
vmovsd qword ptr [rdi + 8], xmm0
cmp rsi, rcx
ja .LBB263_13
# xmm4 = old [rdi+8] - new [rdi]: the value carried into the following elements.
# rsi = count - 2 remaining elements, eax = leftover (0..3), rcx = number of 4-element groups.
vsubsd xmm4, xmm4, xmm5
add rsi, -2
mov eax, esi
and eax, 3
mov rcx, rsi
shr rcx, 2
je .LBB263_3
# Vector-loop setup: xmm5 = xmm1 XOR [.LCPI263_1], xmm6 = xmm1^4, xmm7..xmm10 = xmm1,
# xmm2, xmm5, xmm3 each duplicated into both lanes, xmm0 = two-lane accumulator cleared
# to zero, r8 = byte offset 32, xmm11 = carry.
# NOTE(review): .LCPI263_1 is not shown; presumably a sign-bit mask so that xmm5 = -xmm1.
vxorpd xmm5, xmm1, xmmword ptr [rip + .LCPI263_1]
vmulsd xmm6, xmm3, xmm3
vmovddup xmm7, xmm1
vmovddup xmm8, xmm2
vmovddup xmm9, xmm5
vmovddup xmm10, xmm3
vxorpd xmm0, xmm0, xmm0
mov r8d, 32
vmovapd xmm11, xmm4
.p2align 4
# Main loop: each iteration reads the four doubles at [rdi+r8-16, rdi+r8+16), rewrites
# them in place, multiplies the new values by the four doubles at the same offsets from
# rdx and adds the products into xmm0. xmm4 is the carry for the next iteration and is
# copied back into xmm11 at the bottom of the loop.
.LBB263_7:
vmovupd xmm12, xmmword ptr [rdi + r8 - 16]
vmovapd xmm4, xmm11
vfmadd213sd xmm4, xmm5, xmm12
vmulpd xmm13, xmm12, xmm7
vunpcklpd xmm14, xmm11, xmm4
vfmadd231pd xmm13, xmm8, xmm14
vfmadd213pd xmm12, xmm9, xmmword ptr [rdi + r8 - 8]
vmovsd xmm4, qword ptr [rdi + r8]
vfmadd213pd xmm14, xmm10, xmm12
vmulpd xmm15, xmm7, xmmword ptr [rdi + r8]
vfmadd231pd xmm15, xmm8, xmm14
vfmadd213sd xmm4, xmm5, qword ptr [rdi + r8 + 8]
vfmadd231sd xmm4, xmm3, xmm12
vfmadd231sd xmm4, xmm6, xmm11
vmovupd xmmword ptr [rdi + r8 - 16], xmm13
vmulpd xmm11, xmm13, xmmword ptr [rdx + r8 - 16]
vmovupd xmmword ptr [rdi + r8], xmm15
vmulpd xmm12, xmm15, xmmword ptr [rdx + r8]
vaddpd xmm11, xmm11, xmm12
vaddpd xmm0, xmm11, xmm0
add r8, 32
vmovapd xmm11, xmm4
dec rcx
jne .LBB263_7
# No leftover elements: skip the scalar tail.
test rax, rax
je .LBB263_5
# Scalar tail for the 1..3 leftover elements. rsi is masked with 0x0FFFFFFFFFFFFFFC,
# i.e. rounded down to a multiple of 4 elements; the leftovers sit at
# [base + 8*rsi + 16], +24 and +32.
.LBB263_8:
movabs rcx, 1152921504606846972
and rsi, rcx
# First leftover: new = xmm2*xmm4 + xmm1*old; running sum xmm3 = xmm0 (low lane) + new*[rdx+...].
vmulsd xmm3, xmm2, xmm4
vmovsd xmm5, qword ptr [rdi + 8*rsi + 16]
vmulsd xmm6, xmm1, xmm5
vaddsd xmm3, xmm3, xmm6
vmovsd qword ptr [rdi + 8*rsi + 16], xmm3
vmulsd xmm3, xmm3, qword ptr [rdx + 8*rsi + 16]
vaddsd xmm3, xmm0, xmm3
cmp eax, 1
je .LBB263_11
# Second leftover: carry xmm4 = previous old element - xmm1*xmm4, then the same update.
vmulsd xmm4, xmm1, xmm4
vsubsd xmm4, xmm5, xmm4
vmulsd xmm6, xmm2, xmm4
vmovsd xmm5, qword ptr [rdi + 8*rsi + 24]
vmulsd xmm7, xmm1, xmm5
vaddsd xmm6, xmm6, xmm7
vmovsd qword ptr [rdi + 8*rsi + 24], xmm6
vmulsd xmm6, xmm6, qword ptr [rdx + 8*rsi + 24]
vaddsd xmm3, xmm3, xmm6
cmp eax, 2
je .LBB263_11
# Third leftover: same pattern; xmm1 and xmm2 are overwritten because nothing follows.
vmulsd xmm4, xmm1, xmm4
vsubsd xmm4, xmm5, xmm4
vmulsd xmm2, xmm2, xmm4
vmulsd xmm1, xmm1, qword ptr [rdi + 8*rsi + 32]
vaddsd xmm1, xmm2, xmm1
vmovsd qword ptr [rdi + 8*rsi + 32], xmm1
vmulsd xmm1, xmm1, qword ptr [rdx + 8*rsi + 32]
vaddsd xmm3, xmm3, xmm1
# Swap the accumulator lanes and add the former high lane to the scalar running sum;
# the result is returned in xmm0.
.LBB263_11:
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm3, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
# Fewer than four remaining elements: clear the accumulator, then run the tail if rax != 0.
.LBB263_3:
.cfi_def_cfa_offset 16
vxorpd xmm0, xmm0, xmm0
test rax, rax
jne .LBB263_8
# No tail: the result is low lane + high lane of xmm0.
.LBB263_5:
vmovapd xmm3, xmm0
vshufpd xmm0, xmm0, xmm0, 1
vaddsd xmm0, xmm3, xmm0
pop rax
.cfi_def_cfa_offset 8
ret
# rsi <= 1: call core::panicking::panic through the GOT with rdi and rdx holding
# addresses of anonymous constants and esi = 30.
# NOTE(review): presumably message pointer, message length and source location — confirm.
.LBB263_12:
.cfi_def_cfa_offset 16
lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.517]
lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.518]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
# rsi > rcx: set up the call to core::slice::index::slice_index_fail with edi = 2,
# rsi unchanged, rdx = the old rcx and rcx = address of an anonymous constant.
.LBB263_13:
lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.519]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results:
Artifact: x86_64-base
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir — compiler-generated x86-64 listing, Intel syntax
# (x86_64-base artifact; only SSE2 two-operand instructions are used).
# Register use visible in this listing:
#   rdi  = base of a buffer of doubles that is rewritten in place
#   rsi  = element count, checked against 1 and against rcx
#   rdx  = base of a second buffer of doubles that is only read
#   rcx  = bound compared with rsi (rsi > rcx takes the slice_index_fail path)
#   xmm0 = scalar input, xmm1 = scalar coefficient
#   the result (a sum of products) is returned in xmm0
# NOTE(review): the Rust-level parameter names and types are not visible here — confirm against the source.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# push rax moves rsp by 8 (CFA offset becomes 16); the pushed value is never used.
# NOTE(review): presumably keeps rsp 16-byte aligned for the calls on the failure paths.
push rax
.cfi_def_cfa_offset 16
# A count of 0 or 1 goes to the panic path.
cmp rsi, 1
jbe .LBB262_12
# First two elements: xmm4 = xmm1*xmm1, xmm2 = [.LCPI262_0] - xmm4, xmm3 = old [rdi+8].
# New [rdi] = xmm0*xmm1 (kept in xmm5) and new [rdi+8] = xmm1*xmm3 + xmm0*xmm2; both are
# stored before the rsi > rcx check.
# NOTE(review): .LCPI262_0 is not shown in this listing; presumably the constant 1.0.
movapd xmm4, xmm1
mulsd xmm4, xmm1
movsd xmm2, qword ptr [rip + .LCPI262_0]
subsd xmm2, xmm4
movsd xmm3, qword ptr [rdi + 8]
movapd xmm5, xmm0
mulsd xmm5, xmm1
movsd qword ptr [rdi], xmm5
mulsd xmm0, xmm2
movapd xmm6, xmm1
mulsd xmm6, xmm3
addsd xmm6, xmm0
movsd qword ptr [rdi + 8], xmm6
cmp rsi, rcx
ja .LBB262_13
# xmm3 = old [rdi+8] - new [rdi]: the value carried into the following elements.
# rsi = count - 2 remaining elements, eax = leftover (0..3), rcx = number of 4-element groups.
subsd xmm3, xmm5
add rsi, -2
mov eax, esi
and eax, 3
mov rcx, rsi
shr rcx, 2
je .LBB262_3
# Vector-loop setup: xmm5 = xmm1^4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 each duplicated
# into both lanes; xmm0 = two-lane accumulator cleared to zero; r8 = byte offset 32;
# xmm9 = carry.
movapd xmm5, xmm4
mulsd xmm5, xmm4
movapd xmm6, xmm2
unpcklpd xmm6, xmm2
movapd xmm7, xmm1
unpcklpd xmm7, xmm1
movapd xmm8, xmm4
unpcklpd xmm8, xmm4
xorpd xmm0, xmm0
mov r8d, 32
movapd xmm9, xmm3
.p2align 4
# Main loop: each iteration reads the four doubles at [rdi+r8-16, rdi+r8+16), rewrites
# them in place, multiplies the new values by the four doubles at the same offsets from
# rdx and adds the products into xmm0. xmm3 is the carry for the next iteration and is
# copied back into xmm9 at the bottom of the loop.
.LBB262_7:
movapd xmm3, xmm1
mulsd xmm3, xmm9
movapd xmm10, xmm5
mulsd xmm10, xmm9
movupd xmm11, xmmword ptr [rdi + r8 - 16]
movupd xmm12, xmmword ptr [rdi + r8 - 8]
movupd xmm13, xmmword ptr [rdi + r8]
movapd xmm14, xmm7
mulpd xmm14, xmm11
subsd xmm11, xmm3
unpcklpd xmm9, xmm11
movapd xmm11, xmm6
mulpd xmm11, xmm9
addpd xmm11, xmm14
mulpd xmm9, xmm8
movsd xmm3, qword ptr [rdi + r8 + 8]
subpd xmm12, xmm14
addpd xmm9, xmm12
mulpd xmm9, xmm6
mulpd xmm13, xmm7
addpd xmm9, xmm13
mulsd xmm12, xmm4
subsd xmm3, xmm13
addsd xmm3, xmm12
addsd xmm3, xmm10
movupd xmmword ptr [rdi + r8 - 16], xmm11
movupd xmmword ptr [rdi + r8], xmm9
movupd xmm10, xmmword ptr [rdx + r8 - 16]
movupd xmm12, xmmword ptr [rdx + r8]
mulpd xmm10, xmm11
mulpd xmm12, xmm9
addpd xmm12, xmm10
addpd xmm0, xmm12
add r8, 32
movapd xmm9, xmm3
dec rcx
jne .LBB262_7
# No leftover elements: skip the scalar tail.
test rax, rax
je .LBB262_5
# Scalar tail for the 1..3 leftover elements. rsi is masked with 0x0FFFFFFFFFFFFFFC,
# i.e. rounded down to a multiple of 4 elements; the leftovers sit at
# [base + 8*rsi + 16], +24 and +32.
.LBB262_8:
movabs rcx, 1152921504606846972
and rsi, rcx
# First leftover: new = xmm1*old + xmm2*xmm3; running sum xmm4 = new*[rdx+...] + xmm0 (low lane).
movapd xmm6, xmm2
mulsd xmm6, xmm3
movsd xmm5, qword ptr [rdi + 8*rsi + 16]
movapd xmm4, xmm1
mulsd xmm4, xmm5
addsd xmm4, xmm6
movsd qword ptr [rdi + 8*rsi + 16], xmm4
mulsd xmm4, qword ptr [rdx + 8*rsi + 16]
addsd xmm4, xmm0
cmp eax, 1
je .LBB262_11
# Second leftover: carry xmm5 = previous old element - xmm1*xmm3, then the same update.
mulsd xmm3, xmm1
subsd xmm5, xmm3
movapd xmm6, xmm2
mulsd xmm6, xmm5
movsd xmm3, qword ptr [rdi + 8*rsi + 24]
movapd xmm7, xmm1
mulsd xmm7, xmm3
addsd xmm7, xmm6
movsd qword ptr [rdi + 8*rsi + 24], xmm7
mulsd xmm7, qword ptr [rdx + 8*rsi + 24]
addsd xmm4, xmm7
cmp eax, 2
je .LBB262_11
# Third leftover: same pattern; xmm1 and xmm2 are overwritten because nothing follows.
mulsd xmm5, xmm1
subsd xmm3, xmm5
mulsd xmm2, xmm3
mulsd xmm1, qword ptr [rdi + 8*rsi + 32]
addsd xmm1, xmm2
movsd qword ptr [rdi + 8*rsi + 32], xmm1
mulsd xmm1, qword ptr [rdx + 8*rsi + 32]
addsd xmm4, xmm1
# Move the accumulator's high lane into the low lane and add the scalar running sum;
# the result is returned in xmm0.
.LBB262_11:
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
# Fewer than four remaining elements: clear the accumulator, then run the tail if rax != 0.
.LBB262_3:
.cfi_def_cfa_offset 16
xorpd xmm0, xmm0
test rax, rax
jne .LBB262_8
# No tail: the result is high lane + low lane of xmm0.
.LBB262_5:
movapd xmm4, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
# rsi <= 1: call core::panicking::panic through the GOT with rdi and rdx holding
# addresses of anonymous constants and esi = 30.
# NOTE(review): presumably message pointer, message length and source location — confirm.
.LBB262_12:
.cfi_def_cfa_offset 16
lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.532]
lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.533]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
# rsi > rcx: set up the call to core::slice::index::slice_index_fail with edi = 2,
# rsi unchanged, rdx = the old rcx and rcx = address of an anonymous constant.
.LBB262_13:
lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.534]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results:
Artifact: x86_64-merge
Assembly: Details
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
# jbonsai::vocoder::mlsa::fir — compiler-generated x86-64 listing, Intel syntax
# (x86_64-merge artifact; only SSE2 two-operand instructions are used).
# Register use visible in this listing:
#   rdi  = base of a buffer of doubles that is rewritten in place
#   rsi  = element count, checked against 1 and against rcx
#   rdx  = base of a second buffer of doubles that is only read
#   rcx  = bound compared with rsi (rsi > rcx takes the slice_index_fail path)
#   xmm0 = scalar input, xmm1 = scalar coefficient
#   the result (a sum of products) is returned in xmm0
# NOTE(review): the Rust-level parameter names and types are not visible here — confirm against the source.
.p2align 4
.type jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
.cfi_startproc
# push rax moves rsp by 8 (CFA offset becomes 16); the pushed value is never used.
# NOTE(review): presumably keeps rsp 16-byte aligned for the calls on the failure paths.
push rax
.cfi_def_cfa_offset 16
# A count of 0 or 1 goes to the panic path.
cmp rsi, 1
jbe .LBB263_12
# First two elements: xmm4 = xmm1*xmm1, xmm2 = [.LCPI263_0] - xmm4, xmm3 = old [rdi+8].
# New [rdi] = xmm0*xmm1 (kept in xmm5) and new [rdi+8] = xmm1*xmm3 + xmm0*xmm2; both are
# stored before the rsi > rcx check.
# NOTE(review): .LCPI263_0 is not shown in this listing; presumably the constant 1.0.
movapd xmm4, xmm1
mulsd xmm4, xmm1
movsd xmm2, qword ptr [rip + .LCPI263_0]
subsd xmm2, xmm4
movsd xmm3, qword ptr [rdi + 8]
movapd xmm5, xmm0
mulsd xmm5, xmm1
movsd qword ptr [rdi], xmm5
mulsd xmm0, xmm2
movapd xmm6, xmm1
mulsd xmm6, xmm3
addsd xmm6, xmm0
movsd qword ptr [rdi + 8], xmm6
cmp rsi, rcx
ja .LBB263_13
# xmm3 = old [rdi+8] - new [rdi]: the value carried into the following elements.
# rsi = count - 2 remaining elements, eax = leftover (0..3), rcx = number of 4-element groups.
subsd xmm3, xmm5
add rsi, -2
mov eax, esi
and eax, 3
mov rcx, rsi
shr rcx, 2
je .LBB263_3
# Vector-loop setup: xmm5 = xmm1^4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 each duplicated
# into both lanes; xmm0 = two-lane accumulator cleared to zero; r8 = byte offset 32;
# xmm9 = carry.
movapd xmm5, xmm4
mulsd xmm5, xmm4
movapd xmm6, xmm2
unpcklpd xmm6, xmm2
movapd xmm7, xmm1
unpcklpd xmm7, xmm1
movapd xmm8, xmm4
unpcklpd xmm8, xmm4
xorpd xmm0, xmm0
mov r8d, 32
movapd xmm9, xmm3
.p2align 4
# Main loop: each iteration reads the four doubles at [rdi+r8-16, rdi+r8+16), rewrites
# them in place, multiplies the new values by the four doubles at the same offsets from
# rdx and adds the products into xmm0. xmm3 is the carry for the next iteration and is
# copied back into xmm9 at the bottom of the loop.
.LBB263_7:
movapd xmm3, xmm1
mulsd xmm3, xmm9
movapd xmm10, xmm5
mulsd xmm10, xmm9
movupd xmm11, xmmword ptr [rdi + r8 - 16]
movupd xmm12, xmmword ptr [rdi + r8 - 8]
movupd xmm13, xmmword ptr [rdi + r8]
movapd xmm14, xmm7
mulpd xmm14, xmm11
subsd xmm11, xmm3
unpcklpd xmm9, xmm11
movapd xmm11, xmm6
mulpd xmm11, xmm9
addpd xmm11, xmm14
mulpd xmm9, xmm8
movsd xmm3, qword ptr [rdi + r8 + 8]
subpd xmm12, xmm14
addpd xmm9, xmm12
mulpd xmm9, xmm6
mulpd xmm13, xmm7
addpd xmm9, xmm13
mulsd xmm12, xmm4
subsd xmm3, xmm13
addsd xmm3, xmm12
addsd xmm3, xmm10
movupd xmmword ptr [rdi + r8 - 16], xmm11
movupd xmmword ptr [rdi + r8], xmm9
movupd xmm10, xmmword ptr [rdx + r8 - 16]
movupd xmm12, xmmword ptr [rdx + r8]
mulpd xmm10, xmm11
mulpd xmm12, xmm9
addpd xmm12, xmm10
addpd xmm0, xmm12
add r8, 32
movapd xmm9, xmm3
dec rcx
jne .LBB263_7
# No leftover elements: skip the scalar tail.
test rax, rax
je .LBB263_5
# Scalar tail for the 1..3 leftover elements. rsi is masked with 0x0FFFFFFFFFFFFFFC,
# i.e. rounded down to a multiple of 4 elements; the leftovers sit at
# [base + 8*rsi + 16], +24 and +32.
.LBB263_8:
movabs rcx, 1152921504606846972
and rsi, rcx
# First leftover: new = xmm1*old + xmm2*xmm3; running sum xmm4 = new*[rdx+...] + xmm0 (low lane).
movapd xmm6, xmm2
mulsd xmm6, xmm3
movsd xmm5, qword ptr [rdi + 8*rsi + 16]
movapd xmm4, xmm1
mulsd xmm4, xmm5
addsd xmm4, xmm6
movsd qword ptr [rdi + 8*rsi + 16], xmm4
mulsd xmm4, qword ptr [rdx + 8*rsi + 16]
addsd xmm4, xmm0
cmp eax, 1
je .LBB263_11
# Second leftover: carry xmm5 = previous old element - xmm1*xmm3, then the same update.
mulsd xmm3, xmm1
subsd xmm5, xmm3
movapd xmm6, xmm2
mulsd xmm6, xmm5
movsd xmm3, qword ptr [rdi + 8*rsi + 24]
movapd xmm7, xmm1
mulsd xmm7, xmm3
addsd xmm7, xmm6
movsd qword ptr [rdi + 8*rsi + 24], xmm7
mulsd xmm7, qword ptr [rdx + 8*rsi + 24]
addsd xmm4, xmm7
cmp eax, 2
je .LBB263_11
# Third leftover: same pattern; xmm1 and xmm2 are overwritten because nothing follows.
mulsd xmm5, xmm1
subsd xmm3, xmm5
mulsd xmm2, xmm3
mulsd xmm1, qword ptr [rdi + 8*rsi + 32]
addsd xmm1, xmm2
movsd qword ptr [rdi + 8*rsi + 32], xmm1
mulsd xmm1, qword ptr [rdx + 8*rsi + 32]
addsd xmm4, xmm1
# Move the accumulator's high lane into the low lane and add the scalar running sum;
# the result is returned in xmm0.
.LBB263_11:
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
# Fewer than four remaining elements: clear the accumulator, then run the tail if rax != 0.
.LBB263_3:
.cfi_def_cfa_offset 16
xorpd xmm0, xmm0
test rax, rax
jne .LBB263_8
# No tail: the result is high lane + low lane of xmm0.
.LBB263_5:
movapd xmm4, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm4
pop rax
.cfi_def_cfa_offset 8
ret
# rsi <= 1: call core::panicking::panic through the GOT with rdi and rdx holding
# addresses of anonymous constants and esi = 30.
# NOTE(review): presumably message pointer, message length and source location — confirm.
.LBB263_12:
.cfi_def_cfa_offset 16
lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.517]
lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.518]
mov esi, 30
call qword ptr [rip + core::panicking::panic@GOTPCREL]
# rsi > rcx: set up the call to core::slice::index::slice_index_fail with edi = 2,
# rsi unchanged, rdx = the old rcx and rcx = address of an anonymous constant.
.LBB263_13:
lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.519]
mov edi, 2
mov rdx, rcx
mov rcx, rax
call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]
Benchmark results: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
No description provided.