Skip to content

perf: optimize MlpgAdjust#116

Draft
cm-ayf wants to merge 20 commits into
main from
perf/mlpg-adjust
Draft

perf: optimize MlpgAdjust#116
cm-ayf wants to merge 20 commits into
main from
perf/mlpg-adjust

Conversation

@cm-ayf

@cm-ayf cm-ayf commented Nov 18, 2025

Copy link
Copy Markdown
Contributor

No description provided.

@cm-ayf cm-ayf self-assigned this Nov 18, 2025
@codecov

codecov Bot commented Nov 18, 2025

Copy link
Copy Markdown

Codecov Report

❌ Patch coverage is 96.49123% with 4 lines in your changes missing coverage. Please review.
✅ Project coverage is 71.28%. Comparing base (a77a27a) to head (55d2a5a).

Files with missing lines Patch % Lines
src/mlpg_adjust/mlpg.rs 96.47% 3 Missing ⚠️
src/mlpg_adjust/mod.rs 94.73% 1 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main     #116      +/-   ##
==========================================
- Coverage   71.46%   71.28%   -0.18%     
==========================================
  Files          37       37              
  Lines        1675     1689      +14     
==========================================
+ Hits         1197     1204       +7     
- Misses        478      485       +7     

☔ View full report in Codecov by Harness.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@github-actions

github-actions Bot commented Nov 29, 2025

Copy link
Copy Markdown

mlsafir diagnostics

ref: 55d2a5ae98d08c5c7df31a80e778b3bdf95314b0

Artifact: aarch64-base

Assembly:

Details
// Compiler-generated AArch64 listing of jbonsai::vocoder::mlsa::fir.
// Register roles as read from the instructions below:
//   x0 = base of a buffer of 8-byte doubles that is rewritten in place, x1 = its element count
//   x2 = base of a second buffer of doubles that is only read, x3 = its element count
//   d0 = scalar input, d1 = scalar coefficient; the result is returned in d0
// NOTE(review): the Rust-level parameter names are not visible in this listing - confirm against the source.
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	2
.type	jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	// Save the frame pointer / link register pair and set up x29.
	stp x29, x30, [sp, #-16]!
	.cfi_def_cfa_offset 16
	mov x29, sp
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	.cfi_remember_state
	// x1 <= 1: branch to the panic call at .LBB261_12.
	cmp x1, #1
	b.ls .LBB261_12
	// d4 = d1*d1, d2 = 1 - d1*d1, d3 = old buf[1].
	// buf[0] = d0*d1 and buf[1] = d0*d2 + d1*(old buf[1]) are stored before the
	// branch that acts on the x1/x3 comparison.
	fmul d4, d1, d1
	fmov d2, #1.00000000
	ldr d3, [x0, #8]
	cmp x1, x3
	fmul d6, d1, d3
	fsub d2, d2, d4
	fmul d5, d0, d2
	fmul d0, d0, d1
	fadd d5, d5, d6
	stp d0, d5, [x0]
	// x1 > x3: branch to the slice_index_fail call at .LBB261_13.
	b.hi .LBB261_13
	// d3 = (old buf[1]) - d0*d1 is the scalar state carried through the rest of the function.
	// x9 = x1 - 2 elements still to process; x10 = x9 / 4 main-loop trips; x8 = x9 % 4 tail elements.
	fsub d3, d3, d0
	sub x9, x1, #2
	lsr x10, x9, #2
	and x8, x9, #0x3
	cbz x10, .LBB261_10
	// Loop setup: d5 = d4*d4, d6 = -d1, v0 = zeroed two-lane accumulator,
	// x11 / x12 = 32 bytes past the start of each buffer.
	fmul d5, d4, d4
	fneg d6, d1
	movi v0.2d, #0000000000000000
	add x11, x0, #32
	add x12, x2, #32
.LBB261_4:
	// Main loop: each trip rewrites the four doubles at [x11-16, x11+16), multiplies the
	// new values by the four doubles at [x12-16, x12+16) and adds the products into v0.
	// d3 is updated once per trip for the next group.
	ldur q7, [x11, #-16]
	ldur q17, [x11, #-8]
	mov v18.16b, v3.16b
	subs x10, x10, #1
	fmsub d16, d1, d3, d7
	fmla v17.2d, v7.2d, v6.d[0]
	fmul v7.2d, v7.2d, v1.d[0]
	mov v18.d[1], v16.d[0]
	mov v19.16b, v17.16b
	ldr q16, [x11]
	fmul v20.2d, v16.2d, v1.d[0]
	mov d16, v16.d[1]
	fmla v19.2d, v18.2d, v4.d[0]
	fmla v7.2d, v18.2d, v2.d[0]
	ldr d18, [x11]
	fmsub d16, d1, d18, d16
	fmla v20.2d, v19.2d, v2.d[0]
	ldp q19, q21, [x12, #-16]
	add x12, x12, #32
	fmadd d16, d4, d17, d16
	fmul v18.2d, v7.2d, v19.2d
	fmul v19.2d, v20.2d, v21.2d
	stp q7, q20, [x11, #-16]
	add x11, x11, #32
	fmadd d3, d5, d3, d16
	fadd v17.2d, v18.2d, v19.2d
	fadd v0.2d, v0.2d, v17.2d
	b.ne .LBB261_4
	cbz x8, .LBB261_11
.LBB261_6:
	// Tail (x8 = 1..3 elements). x10 = (x9*8) & ~31 is the byte size of the
	// four-element groups handled above; the tail elements sit at +16, +24, +32 from there.
	// First tail element: new = d2*d3 + d1*old; d4 = (lane 0 of v0) + coef*new.
	lsl x9, x9, #3
	fmul d4, d2, d3
	cmp x8, #1
	and x10, x9, #0x7fffffffffffffe0
	add x9, x0, x10
	add x10, x2, x10
	ldur d5, [x9, #16]
	fmul d6, d1, d5
	fadd d6, d4, d6
	ldur d4, [x10, #16]
	fmul d4, d4, d6
	stur d6, [x9, #16]
	fadd d4, d0, d4
	b.eq .LBB261_9
	// Second tail element: state d3 = old - d1*d3, then the same update / accumulate step.
	fmul d3, d1, d3
	cmp x8, #2
	fsub d3, d5, d3
	ldur d5, [x9, #24]
	fmul d7, d1, d5
	fmul d6, d2, d3
	fadd d6, d6, d7
	ldur d7, [x10, #24]
	fmul d7, d7, d6
	stur d6, [x9, #24]
	fadd d4, d4, d7
	b.eq .LBB261_9
	// Third tail element: same step once more (d1 and d2 are overwritten, they are no longer needed).
	fmul d3, d1, d3
	fsub d3, d5, d3
	ldur d5, [x9, #32]
	fmul d1, d1, d5
	fmul d2, d2, d3
	fadd d1, d2, d1
	ldur d2, [x10, #32]
	fmul d2, d2, d1
	stur d1, [x9, #32]
	fadd d4, d4, d2
.LBB261_9:
	// Result = running scalar sum in d4 + upper lane of v0; restore x29/x30 and return.
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB261_10:
	.cfi_restore_state
	.cfi_remember_state
	// No full four-element group: clear the accumulator and run the tail if x8 != 0.
	movi v0.2d, #0000000000000000
	cbnz x8, .LBB261_6
.LBB261_11:
	// No tail elements: result = lane 0 of v0 + lane 1 of v0.
	fmov d4, d0
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB261_12:
	.cfi_restore_state
	// Reached when x1 <= 1: call core::panicking::panic with x0 = address of an anonymous
	// constant, w1 = 30 and x2 = address of a second anonymous constant.
	// NOTE(review): presumably a 30-byte message plus source location, and the call does not return - confirm.
	adrp x0, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.530
	add x0, x0, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.530
	adrp x2, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.531
	add x2, x2, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.531
	mov w1, #30
	bl core::panicking::panic
.LBB261_13:
	// Reached when x1 > x3: call core::slice::index::slice_index_fail with w0 = 2,
	// x1 unchanged, x2 = x3 and x3 = address of an anonymous constant.
	adrp x8, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.532
	add x8, x8, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.532
	mov w0, #2
	mov x2, x3
	mov x3, x8
	bl core::slice::index::slice_index_fail

Benchmark results:


running 3 tests
test bonsai        ... bench:  16,500,471.20 ns/iter (+/- 78,146.11)
test bonsai_letter ... bench:  45,285,314.80 ns/iter (+/- 109,543.16)
test is_bonsai     ... bench:  25,396,634.90 ns/iter (+/- 70,572.50)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 26.26s

Artifact: aarch64-merge

Assembly:

Details
// Compiler-generated AArch64 listing of jbonsai::vocoder::mlsa::fir.
// Register roles as read from the instructions below:
//   x0 = base of a buffer of 8-byte doubles that is rewritten in place, x1 = its element count
//   x2 = base of a second buffer of doubles that is only read, x3 = its element count
//   d0 = scalar input, d1 = scalar coefficient; the result is returned in d0
// NOTE(review): the Rust-level parameter names are not visible in this listing - confirm against the source.
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	2
.type	jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	// Save the frame pointer / link register pair and set up x29.
	stp x29, x30, [sp, #-16]!
	.cfi_def_cfa_offset 16
	mov x29, sp
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	.cfi_remember_state
	// x1 <= 1: branch to the panic call at .LBB262_12.
	cmp x1, #1
	b.ls .LBB262_12
	// d4 = d1*d1, d2 = 1 - d1*d1, d3 = old buf[1].
	// buf[0] = d0*d1 and buf[1] = d0*d2 + d1*(old buf[1]) are stored before the
	// branch that acts on the x1/x3 comparison.
	fmul d4, d1, d1
	fmov d2, #1.00000000
	ldr d3, [x0, #8]
	cmp x1, x3
	fmul d6, d1, d3
	fsub d2, d2, d4
	fmul d5, d0, d2
	fmul d0, d0, d1
	fadd d5, d5, d6
	stp d0, d5, [x0]
	// x1 > x3: branch to the slice_index_fail call at .LBB262_13.
	b.hi .LBB262_13
	// d3 = (old buf[1]) - d0*d1 is the scalar state carried through the rest of the function.
	// x9 = x1 - 2 elements still to process; x10 = x9 / 4 main-loop trips; x8 = x9 % 4 tail elements.
	fsub d3, d3, d0
	sub x9, x1, #2
	lsr x10, x9, #2
	and x8, x9, #0x3
	cbz x10, .LBB262_10
	// Loop setup: d5 = d4*d4, d6 = -d1, v0 = zeroed two-lane accumulator,
	// x11 / x12 = 32 bytes past the start of each buffer.
	fmul d5, d4, d4
	fneg d6, d1
	movi v0.2d, #0000000000000000
	add x11, x0, #32
	add x12, x2, #32
.LBB262_4:
	// Main loop: each trip rewrites the four doubles at [x11-16, x11+16), multiplies the
	// new values by the four doubles at [x12-16, x12+16) and adds the products into v0.
	// d3 is updated once per trip for the next group.
	ldur q7, [x11, #-16]
	ldur q17, [x11, #-8]
	mov v18.16b, v3.16b
	subs x10, x10, #1
	fmsub d16, d1, d3, d7
	fmla v17.2d, v7.2d, v6.d[0]
	fmul v7.2d, v7.2d, v1.d[0]
	mov v18.d[1], v16.d[0]
	mov v19.16b, v17.16b
	ldr q16, [x11]
	fmul v20.2d, v16.2d, v1.d[0]
	mov d16, v16.d[1]
	fmla v19.2d, v18.2d, v4.d[0]
	fmla v7.2d, v18.2d, v2.d[0]
	ldr d18, [x11]
	fmsub d16, d1, d18, d16
	fmla v20.2d, v19.2d, v2.d[0]
	ldp q19, q21, [x12, #-16]
	add x12, x12, #32
	fmadd d16, d4, d17, d16
	fmul v18.2d, v7.2d, v19.2d
	fmul v19.2d, v20.2d, v21.2d
	stp q7, q20, [x11, #-16]
	add x11, x11, #32
	fmadd d3, d5, d3, d16
	fadd v17.2d, v18.2d, v19.2d
	fadd v0.2d, v0.2d, v17.2d
	b.ne .LBB262_4
	cbz x8, .LBB262_11
.LBB262_6:
	// Tail (x8 = 1..3 elements). x10 = (x9*8) & ~31 is the byte size of the
	// four-element groups handled above; the tail elements sit at +16, +24, +32 from there.
	// First tail element: new = d2*d3 + d1*old; d4 = (lane 0 of v0) + coef*new.
	lsl x9, x9, #3
	fmul d4, d2, d3
	cmp x8, #1
	and x10, x9, #0x7fffffffffffffe0
	add x9, x0, x10
	add x10, x2, x10
	ldur d5, [x9, #16]
	fmul d6, d1, d5
	fadd d6, d4, d6
	ldur d4, [x10, #16]
	fmul d4, d4, d6
	stur d6, [x9, #16]
	fadd d4, d0, d4
	b.eq .LBB262_9
	// Second tail element: state d3 = old - d1*d3, then the same update / accumulate step.
	fmul d3, d1, d3
	cmp x8, #2
	fsub d3, d5, d3
	ldur d5, [x9, #24]
	fmul d7, d1, d5
	fmul d6, d2, d3
	fadd d6, d6, d7
	ldur d7, [x10, #24]
	fmul d7, d7, d6
	stur d6, [x9, #24]
	fadd d4, d4, d7
	b.eq .LBB262_9
	// Third tail element: same step once more (d1 and d2 are overwritten, they are no longer needed).
	fmul d3, d1, d3
	fsub d3, d5, d3
	ldur d5, [x9, #32]
	fmul d1, d1, d5
	fmul d2, d2, d3
	fadd d1, d2, d1
	ldur d2, [x10, #32]
	fmul d2, d2, d1
	stur d1, [x9, #32]
	fadd d4, d4, d2
.LBB262_9:
	// Result = running scalar sum in d4 + upper lane of v0; restore x29/x30 and return.
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB262_10:
	.cfi_restore_state
	.cfi_remember_state
	// No full four-element group: clear the accumulator and run the tail if x8 != 0.
	movi v0.2d, #0000000000000000
	cbnz x8, .LBB262_6
.LBB262_11:
	// No tail elements: result = lane 0 of v0 + lane 1 of v0.
	fmov d4, d0
	mov d0, v0.d[1]
	fadd d0, d4, d0
	.cfi_def_cfa wsp, 16
	ldp x29, x30, [sp], #16
	.cfi_def_cfa_offset 0
	.cfi_restore w30
	.cfi_restore w29
	ret
.LBB262_12:
	.cfi_restore_state
	// Reached when x1 <= 1: call core::panicking::panic with x0 = address of an anonymous
	// constant, w1 = 30 and x2 = address of a second anonymous constant.
	// NOTE(review): presumably a 30-byte message plus source location, and the call does not return - confirm.
	adrp x0, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.515
	add x0, x0, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.515
	adrp x2, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.516
	add x2, x2, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.516
	mov w1, #30
	bl core::panicking::panic
.LBB262_13:
	// Reached when x1 > x3: call core::slice::index::slice_index_fail with w0 = 2,
	// x1 unchanged, x2 = x3 and x3 = address of an anonymous constant.
	adrp x8, .Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.517
	add x8, x8, :lo12:.Lanon.153c9ab06d5becc7cc0ce5c1295b31e9.517
	mov w0, #2
	mov x2, x3
	mov x3, x8
	bl core::slice::index::slice_index_fail

Benchmark results:


running 3 tests
test bonsai        ... bench:  15,283,852.20 ns/iter (+/- 54,652.70)
test bonsai_letter ... bench:  42,238,446.80 ns/iter (+/- 91,047.96)
test is_bonsai     ... bench:  23,605,071.00 ns/iter (+/- 57,750.99)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 24.44s

Artifact: x86_64+avx2+fma-base

Assembly:

Details
# Compiler-generated x86-64 (Intel syntax, AVX/FMA encodings) listing of jbonsai::vocoder::mlsa::fir.
# Register roles as read from the instructions below:
#   rdi = base of a buffer of 8-byte doubles that is rewritten in place, rsi = its element count
#   rdx = base of a second buffer of doubles that is only read, rcx = its element count
#   xmm0 = scalar input, xmm1 = scalar coefficient; the result is returned in xmm0
# NOTE(review): the Rust-level parameter names are not visible in this listing - confirm against the source.
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# The pushed rax value is never read; the push only moves rsp by 8
	# (presumably to keep the stack 16-byte aligned at the calls below).
	push rax
	.cfi_def_cfa_offset 16
	# rsi <= 1: jump to the panic call at .LBB262_12.
	cmp rsi, 1
	jbe .LBB262_12
	# xmm3 = xmm1*xmm1, xmm2 = (.LCPI262_0) - xmm3, xmm4 = old buf[1].
	# NOTE(review): the value of .LCPI262_0 is not shown in this listing.
	# buf[0] = xmm0*xmm1 and buf[1] = xmm0*xmm2 + xmm1*(old buf[1]) are stored before the rsi/rcx check.
	vmulsd xmm3, xmm1, xmm1
	vmovsd xmm2, qword ptr [rip + .LCPI262_0]
	vsubsd xmm2, xmm2, xmm3
	vmovsd xmm4, qword ptr [rdi + 8]
	vmulsd xmm5, xmm0, xmm1
	vmovsd qword ptr [rdi], xmm5
	vmulsd xmm0, xmm0, xmm2
	vmulsd xmm6, xmm1, xmm4
	vaddsd xmm0, xmm0, xmm6
	vmovsd qword ptr [rdi + 8], xmm0
	# rsi > rcx: jump to the slice_index_fail call at .LBB262_13.
	cmp rsi, rcx
	ja .LBB262_13
	# xmm4 = (old buf[1]) - (new buf[0]) is the scalar state carried forward.
	# rsi = count - 2; eax = rsi % 4 tail elements; rcx = rsi / 4 main-loop trips.
	vsubsd xmm4, xmm4, xmm5
	add rsi, -2
	mov eax, esi
	and eax, 3
	mov rcx, rsi
	shr rcx, 2
	je .LBB262_3
	# Loop setup: xmm5 = xmm1 XOR (.LCPI262_1) (presumably a sign mask, i.e. -xmm1 - constant not shown),
	# xmm6 = xmm3*xmm3, xmm7..xmm10 = xmm1, xmm2, xmm5, xmm3 duplicated into both lanes,
	# xmm0 = zeroed two-lane accumulator, r8 = byte offset 32, xmm11 = carried state.
	vxorpd xmm5, xmm1, xmmword ptr [rip + .LCPI262_1]
	vmulsd xmm6, xmm3, xmm3
	vmovddup xmm7, xmm1
	vmovddup xmm8, xmm2
	vmovddup xmm9, xmm5
	vmovddup xmm10, xmm3
	vxorpd xmm0, xmm0, xmm0
	mov r8d, 32
	vmovapd xmm11, xmm4
	.p2align	4
.LBB262_7:
	# Main loop: each trip rewrites the four doubles at [rdi+r8-16, rdi+r8+16), multiplies the
	# new values by the four doubles at [rdx+r8-16, rdx+r8+16) and adds the products into xmm0.
	# xmm4 receives the carried state for the next trip (copied to xmm11 at the bottom).
	vmovupd xmm12, xmmword ptr [rdi + r8 - 16]
	vmovapd xmm4, xmm11
	vfmadd213sd xmm4, xmm5, xmm12
	vmulpd xmm13, xmm12, xmm7
	vunpcklpd xmm14, xmm11, xmm4
	vfmadd231pd xmm13, xmm8, xmm14
	vfmadd213pd xmm12, xmm9, xmmword ptr [rdi + r8 - 8]
	vmovsd xmm4, qword ptr [rdi + r8]
	vfmadd213pd xmm14, xmm10, xmm12
	vmulpd xmm15, xmm7, xmmword ptr [rdi + r8]
	vfmadd231pd xmm15, xmm8, xmm14
	vfmadd213sd xmm4, xmm5, qword ptr [rdi + r8 + 8]
	vfmadd231sd xmm4, xmm3, xmm12
	vfmadd231sd xmm4, xmm6, xmm11
	vmovupd xmmword ptr [rdi + r8 - 16], xmm13
	vmulpd xmm11, xmm13, xmmword ptr [rdx + r8 - 16]
	vmovupd xmmword ptr [rdi + r8], xmm15
	vmulpd xmm12, xmm15, xmmword ptr [rdx + r8]
	vaddpd xmm11, xmm11, xmm12
	vaddpd xmm0, xmm11, xmm0
	add r8, 32
	vmovapd xmm11, xmm4
	dec rcx
	jne .LBB262_7
	test rax, rax
	je .LBB262_5
.LBB262_8:
	# Tail (eax = 1..3 elements). The mask 1152921504606846972 = 2^60 - 4 rounds rsi down to a
	# multiple of 4, so the tail elements are at indices rsi+2, rsi+3, rsi+4.
	# First tail element: new = xmm2*xmm4 + xmm1*old; xmm3 = (lane 0 of xmm0) + coef*new.
	movabs rcx, 1152921504606846972
	and rsi, rcx
	vmulsd xmm3, xmm2, xmm4
	vmovsd xmm5, qword ptr [rdi + 8*rsi + 16]
	vmulsd xmm6, xmm1, xmm5
	vaddsd xmm3, xmm3, xmm6
	vmovsd qword ptr [rdi + 8*rsi + 16], xmm3
	vmulsd xmm3, xmm3, qword ptr [rdx + 8*rsi + 16]
	vaddsd xmm3, xmm0, xmm3
	cmp eax, 1
	je .LBB262_11
	# Second tail element: state xmm4 = old - xmm1*xmm4, then the same update / accumulate step.
	vmulsd xmm4, xmm1, xmm4
	vsubsd xmm4, xmm5, xmm4
	vmulsd xmm6, xmm2, xmm4
	vmovsd xmm5, qword ptr [rdi + 8*rsi + 24]
	vmulsd xmm7, xmm1, xmm5
	vaddsd xmm6, xmm6, xmm7
	vmovsd qword ptr [rdi + 8*rsi + 24], xmm6
	vmulsd xmm6, xmm6, qword ptr [rdx + 8*rsi + 24]
	vaddsd xmm3, xmm3, xmm6
	cmp eax, 2
	je .LBB262_11
	# Third tail element: same step once more (xmm1 and xmm2 are overwritten, they are no longer needed).
	vmulsd xmm4, xmm1, xmm4
	vsubsd xmm4, xmm5, xmm4
	vmulsd xmm2, xmm2, xmm4
	vmulsd xmm1, xmm1, qword ptr [rdi + 8*rsi + 32]
	vaddsd xmm1, xmm2, xmm1
	vmovsd qword ptr [rdi + 8*rsi + 32], xmm1
	vmulsd xmm1, xmm1, qword ptr [rdx + 8*rsi + 32]
	vaddsd xmm3, xmm3, xmm1
.LBB262_11:
	# Result = running scalar sum in xmm3 + upper lane of xmm0 (brought down by the lane swap).
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm3, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB262_3:
	.cfi_def_cfa_offset 16
	# No full four-element group: clear the accumulator and run the tail if eax != 0.
	vxorpd xmm0, xmm0, xmm0
	test rax, rax
	jne .LBB262_8
.LBB262_5:
	# No tail elements: result = lane 0 of xmm0 + lane 1 of xmm0.
	vmovapd xmm3, xmm0
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm3, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB262_12:
	.cfi_def_cfa_offset 16
	# Reached when rsi <= 1: call core::panicking::panic through the GOT with rdi = address of an
	# anonymous constant, esi = 30 and rdx = address of a second anonymous constant.
	# NOTE(review): presumably a 30-byte message plus source location, and the call does not return - confirm.
	lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.532]
	lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.533]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB262_13:
	# Reached when rsi > rcx: call core::slice::index::slice_index_fail through the GOT with
	# edi = 2, rsi unchanged, rdx = old rcx and rcx = address of an anonymous constant.
	lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.534]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  13,280,433.10 ns/iter (+/- 182,266.48)
test bonsai_letter ... bench:  36,842,636.60 ns/iter (+/- 351,586.60)
test is_bonsai     ... bench:  20,441,067.90 ns/iter (+/- 303,511.34)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 21.28s

Artifact: x86_64+avx2+fma-merge

Assembly:

Details
# Compiler-generated x86-64 (Intel syntax, AVX/FMA encodings) listing of jbonsai::vocoder::mlsa::fir.
# Register roles as read from the instructions below:
#   rdi = base of a buffer of 8-byte doubles that is rewritten in place, rsi = its element count
#   rdx = base of a second buffer of doubles that is only read, rcx = its element count
#   xmm0 = scalar input, xmm1 = scalar coefficient; the result is returned in xmm0
# NOTE(review): the Rust-level parameter names are not visible in this listing - confirm against the source.
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# The pushed rax value is never read; the push only moves rsp by 8
	# (presumably to keep the stack 16-byte aligned at the calls below).
	push rax
	.cfi_def_cfa_offset 16
	# rsi <= 1: jump to the panic call at .LBB263_12.
	cmp rsi, 1
	jbe .LBB263_12
	# xmm3 = xmm1*xmm1, xmm2 = (.LCPI263_0) - xmm3, xmm4 = old buf[1].
	# NOTE(review): the value of .LCPI263_0 is not shown in this listing.
	# buf[0] = xmm0*xmm1 and buf[1] = xmm0*xmm2 + xmm1*(old buf[1]) are stored before the rsi/rcx check.
	vmulsd xmm3, xmm1, xmm1
	vmovsd xmm2, qword ptr [rip + .LCPI263_0]
	vsubsd xmm2, xmm2, xmm3
	vmovsd xmm4, qword ptr [rdi + 8]
	vmulsd xmm5, xmm0, xmm1
	vmovsd qword ptr [rdi], xmm5
	vmulsd xmm0, xmm0, xmm2
	vmulsd xmm6, xmm1, xmm4
	vaddsd xmm0, xmm0, xmm6
	vmovsd qword ptr [rdi + 8], xmm0
	# rsi > rcx: jump to the slice_index_fail call at .LBB263_13.
	cmp rsi, rcx
	ja .LBB263_13
	# xmm4 = (old buf[1]) - (new buf[0]) is the scalar state carried forward.
	# rsi = count - 2; eax = rsi % 4 tail elements; rcx = rsi / 4 main-loop trips.
	vsubsd xmm4, xmm4, xmm5
	add rsi, -2
	mov eax, esi
	and eax, 3
	mov rcx, rsi
	shr rcx, 2
	je .LBB263_3
	# Loop setup: xmm5 = xmm1 XOR (.LCPI263_1) (presumably a sign mask, i.e. -xmm1 - constant not shown),
	# xmm6 = xmm3*xmm3, xmm7..xmm10 = xmm1, xmm2, xmm5, xmm3 duplicated into both lanes,
	# xmm0 = zeroed two-lane accumulator, r8 = byte offset 32, xmm11 = carried state.
	vxorpd xmm5, xmm1, xmmword ptr [rip + .LCPI263_1]
	vmulsd xmm6, xmm3, xmm3
	vmovddup xmm7, xmm1
	vmovddup xmm8, xmm2
	vmovddup xmm9, xmm5
	vmovddup xmm10, xmm3
	vxorpd xmm0, xmm0, xmm0
	mov r8d, 32
	vmovapd xmm11, xmm4
	.p2align	4
.LBB263_7:
	# Main loop: each trip rewrites the four doubles at [rdi+r8-16, rdi+r8+16), multiplies the
	# new values by the four doubles at [rdx+r8-16, rdx+r8+16) and adds the products into xmm0.
	# xmm4 receives the carried state for the next trip (copied to xmm11 at the bottom).
	vmovupd xmm12, xmmword ptr [rdi + r8 - 16]
	vmovapd xmm4, xmm11
	vfmadd213sd xmm4, xmm5, xmm12
	vmulpd xmm13, xmm12, xmm7
	vunpcklpd xmm14, xmm11, xmm4
	vfmadd231pd xmm13, xmm8, xmm14
	vfmadd213pd xmm12, xmm9, xmmword ptr [rdi + r8 - 8]
	vmovsd xmm4, qword ptr [rdi + r8]
	vfmadd213pd xmm14, xmm10, xmm12
	vmulpd xmm15, xmm7, xmmword ptr [rdi + r8]
	vfmadd231pd xmm15, xmm8, xmm14
	vfmadd213sd xmm4, xmm5, qword ptr [rdi + r8 + 8]
	vfmadd231sd xmm4, xmm3, xmm12
	vfmadd231sd xmm4, xmm6, xmm11
	vmovupd xmmword ptr [rdi + r8 - 16], xmm13
	vmulpd xmm11, xmm13, xmmword ptr [rdx + r8 - 16]
	vmovupd xmmword ptr [rdi + r8], xmm15
	vmulpd xmm12, xmm15, xmmword ptr [rdx + r8]
	vaddpd xmm11, xmm11, xmm12
	vaddpd xmm0, xmm11, xmm0
	add r8, 32
	vmovapd xmm11, xmm4
	dec rcx
	jne .LBB263_7
	test rax, rax
	je .LBB263_5
.LBB263_8:
	# Tail (eax = 1..3 elements). The mask 1152921504606846972 = 2^60 - 4 rounds rsi down to a
	# multiple of 4, so the tail elements are at indices rsi+2, rsi+3, rsi+4.
	# First tail element: new = xmm2*xmm4 + xmm1*old; xmm3 = (lane 0 of xmm0) + coef*new.
	movabs rcx, 1152921504606846972
	and rsi, rcx
	vmulsd xmm3, xmm2, xmm4
	vmovsd xmm5, qword ptr [rdi + 8*rsi + 16]
	vmulsd xmm6, xmm1, xmm5
	vaddsd xmm3, xmm3, xmm6
	vmovsd qword ptr [rdi + 8*rsi + 16], xmm3
	vmulsd xmm3, xmm3, qword ptr [rdx + 8*rsi + 16]
	vaddsd xmm3, xmm0, xmm3
	cmp eax, 1
	je .LBB263_11
	# Second tail element: state xmm4 = old - xmm1*xmm4, then the same update / accumulate step.
	vmulsd xmm4, xmm1, xmm4
	vsubsd xmm4, xmm5, xmm4
	vmulsd xmm6, xmm2, xmm4
	vmovsd xmm5, qword ptr [rdi + 8*rsi + 24]
	vmulsd xmm7, xmm1, xmm5
	vaddsd xmm6, xmm6, xmm7
	vmovsd qword ptr [rdi + 8*rsi + 24], xmm6
	vmulsd xmm6, xmm6, qword ptr [rdx + 8*rsi + 24]
	vaddsd xmm3, xmm3, xmm6
	cmp eax, 2
	je .LBB263_11
	# Third tail element: same step once more (xmm1 and xmm2 are overwritten, they are no longer needed).
	vmulsd xmm4, xmm1, xmm4
	vsubsd xmm4, xmm5, xmm4
	vmulsd xmm2, xmm2, xmm4
	vmulsd xmm1, xmm1, qword ptr [rdi + 8*rsi + 32]
	vaddsd xmm1, xmm2, xmm1
	vmovsd qword ptr [rdi + 8*rsi + 32], xmm1
	vmulsd xmm1, xmm1, qword ptr [rdx + 8*rsi + 32]
	vaddsd xmm3, xmm3, xmm1
.LBB263_11:
	# Result = running scalar sum in xmm3 + upper lane of xmm0 (brought down by the lane swap).
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm3, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB263_3:
	.cfi_def_cfa_offset 16
	# No full four-element group: clear the accumulator and run the tail if eax != 0.
	vxorpd xmm0, xmm0, xmm0
	test rax, rax
	jne .LBB263_8
.LBB263_5:
	# No tail elements: result = lane 0 of xmm0 + lane 1 of xmm0.
	vmovapd xmm3, xmm0
	vshufpd xmm0, xmm0, xmm0, 1
	vaddsd xmm0, xmm3, xmm0
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB263_12:
	.cfi_def_cfa_offset 16
	# Reached when rsi <= 1: call core::panicking::panic through the GOT with rdi = address of an
	# anonymous constant, esi = 30 and rdx = address of a second anonymous constant.
	# NOTE(review): presumably a 30-byte message plus source location, and the call does not return - confirm.
	lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.517]
	lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.518]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB263_13:
	# Reached when rsi > rcx: call core::slice::index::slice_index_fail through the GOT with
	# edi = 2, rsi unchanged, rdx = old rcx and rcx = address of an anonymous constant.
	lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.519]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  11,803,118.30 ns/iter (+/- 238,705.64)
test bonsai_letter ... bench:  33,307,132.30 ns/iter (+/- 2,585,662.95)
test is_bonsai     ... bench:  18,079,126.90 ns/iter (+/- 232,866.15)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 19.11s

Artifact: x86_64-base

Assembly:

Details
# Compiler-generated x86-64 (Intel syntax, SSE2 encodings) listing of jbonsai::vocoder::mlsa::fir.
# Register roles as read from the instructions below:
#   rdi = base of a buffer of 8-byte doubles that is rewritten in place, rsi = its element count
#   rdx = base of a second buffer of doubles that is only read, rcx = its element count
#   xmm0 = scalar input, xmm1 = scalar coefficient; the result is returned in xmm0
# NOTE(review): the Rust-level parameter names are not visible in this listing - confirm against the source.
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# The pushed rax value is never read; the push only moves rsp by 8
	# (presumably to keep the stack 16-byte aligned at the calls below).
	push rax
	.cfi_def_cfa_offset 16
	# rsi <= 1: jump to the panic call at .LBB262_12.
	cmp rsi, 1
	jbe .LBB262_12
	# xmm4 = xmm1*xmm1, xmm2 = (.LCPI262_0) - xmm4, xmm3 = old buf[1].
	# NOTE(review): the value of .LCPI262_0 is not shown in this listing.
	# buf[0] = xmm0*xmm1 and buf[1] = xmm1*(old buf[1]) + xmm0*xmm2 are stored before the rsi/rcx check.
	movapd xmm4, xmm1
	mulsd xmm4, xmm1
	movsd xmm2, qword ptr [rip + .LCPI262_0]
	subsd xmm2, xmm4
	movsd xmm3, qword ptr [rdi + 8]
	movapd xmm5, xmm0
	mulsd xmm5, xmm1
	movsd qword ptr [rdi], xmm5
	mulsd xmm0, xmm2
	movapd xmm6, xmm1
	mulsd xmm6, xmm3
	addsd xmm6, xmm0
	movsd qword ptr [rdi + 8], xmm6
	# rsi > rcx: jump to the slice_index_fail call at .LBB262_13.
	cmp rsi, rcx
	ja .LBB262_13
	# xmm3 = (old buf[1]) - (new buf[0]) is the scalar state carried forward.
	# rsi = count - 2; eax = rsi % 4 tail elements; rcx = rsi / 4 main-loop trips.
	subsd xmm3, xmm5
	add rsi, -2
	mov eax, esi
	and eax, 3
	mov rcx, rsi
	shr rcx, 2
	je .LBB262_3
	# Loop setup: xmm5 = xmm4*xmm4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 duplicated into both lanes;
	# xmm0 = zeroed two-lane accumulator, r8 = byte offset 32, xmm9 = carried state.
	movapd xmm5, xmm4
	mulsd xmm5, xmm4
	movapd xmm6, xmm2
	unpcklpd xmm6, xmm2
	movapd xmm7, xmm1
	unpcklpd xmm7, xmm1
	movapd xmm8, xmm4
	unpcklpd xmm8, xmm4
	xorpd xmm0, xmm0
	mov r8d, 32
	movapd xmm9, xmm3
	.p2align	4
.LBB262_7:
	# Main loop: each trip rewrites the four doubles at [rdi+r8-16, rdi+r8+16), multiplies the
	# new values by the four doubles at [rdx+r8-16, rdx+r8+16) and adds the products into xmm0.
	# xmm3 receives the carried state for the next trip (copied to xmm9 at the bottom).
	movapd xmm3, xmm1
	mulsd xmm3, xmm9
	movapd xmm10, xmm5
	mulsd xmm10, xmm9
	movupd xmm11, xmmword ptr [rdi + r8 - 16]
	movupd xmm12, xmmword ptr [rdi + r8 - 8]
	movupd xmm13, xmmword ptr [rdi + r8]
	movapd xmm14, xmm7
	mulpd xmm14, xmm11
	subsd xmm11, xmm3
	unpcklpd xmm9, xmm11
	movapd xmm11, xmm6
	mulpd xmm11, xmm9
	addpd xmm11, xmm14
	mulpd xmm9, xmm8
	movsd xmm3, qword ptr [rdi + r8 + 8]
	subpd xmm12, xmm14
	addpd xmm9, xmm12
	mulpd xmm9, xmm6
	mulpd xmm13, xmm7
	addpd xmm9, xmm13
	mulsd xmm12, xmm4
	subsd xmm3, xmm13
	addsd xmm3, xmm12
	addsd xmm3, xmm10
	movupd xmmword ptr [rdi + r8 - 16], xmm11
	movupd xmmword ptr [rdi + r8], xmm9
	movupd xmm10, xmmword ptr [rdx + r8 - 16]
	movupd xmm12, xmmword ptr [rdx + r8]
	mulpd xmm10, xmm11
	mulpd xmm12, xmm9
	addpd xmm12, xmm10
	addpd xmm0, xmm12
	add r8, 32
	movapd xmm9, xmm3
	dec rcx
	jne .LBB262_7
	test rax, rax
	je .LBB262_5
.LBB262_8:
	# Tail (eax = 1..3 elements). The mask 1152921504606846972 = 2^60 - 4 rounds rsi down to a
	# multiple of 4, so the tail elements are at indices rsi+2, rsi+3, rsi+4.
	# First tail element: new = xmm1*old + xmm2*xmm3; xmm4 = coef*new + (lane 0 of xmm0).
	movabs rcx, 1152921504606846972
	and rsi, rcx
	movapd xmm6, xmm2
	mulsd xmm6, xmm3
	movsd xmm5, qword ptr [rdi + 8*rsi + 16]
	movapd xmm4, xmm1
	mulsd xmm4, xmm5
	addsd xmm4, xmm6
	movsd qword ptr [rdi + 8*rsi + 16], xmm4
	mulsd xmm4, qword ptr [rdx + 8*rsi + 16]
	addsd xmm4, xmm0
	cmp eax, 1
	je .LBB262_11
	# Second tail element: state xmm5 = old - xmm1*xmm3, then the same update / accumulate step.
	mulsd xmm3, xmm1
	subsd xmm5, xmm3
	movapd xmm6, xmm2
	mulsd xmm6, xmm5
	movsd xmm3, qword ptr [rdi + 8*rsi + 24]
	movapd xmm7, xmm1
	mulsd xmm7, xmm3
	addsd xmm7, xmm6
	movsd qword ptr [rdi + 8*rsi + 24], xmm7
	mulsd xmm7, qword ptr [rdx + 8*rsi + 24]
	addsd xmm4, xmm7
	cmp eax, 2
	je .LBB262_11
	# Third tail element: same step once more (xmm1 and xmm2 are overwritten, they are no longer needed).
	mulsd xmm5, xmm1
	subsd xmm3, xmm5
	mulsd xmm2, xmm3
	mulsd xmm1, qword ptr [rdi + 8*rsi + 32]
	addsd xmm1, xmm2
	movsd qword ptr [rdi + 8*rsi + 32], xmm1
	mulsd xmm1, qword ptr [rdx + 8*rsi + 32]
	addsd xmm4, xmm1
.LBB262_11:
	# Result = upper lane of xmm0 + running scalar sum in xmm4.
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB262_3:
	.cfi_def_cfa_offset 16
	# No full four-element group: clear the accumulator and run the tail if eax != 0.
	xorpd xmm0, xmm0
	test rax, rax
	jne .LBB262_8
.LBB262_5:
	# No tail elements: result = lane 1 of xmm0 + lane 0 of xmm0.
	movapd xmm4, xmm0
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB262_12:
	.cfi_def_cfa_offset 16
	# Reached when rsi <= 1: call core::panicking::panic through the GOT with rdi = address of an
	# anonymous constant, esi = 30 and rdx = address of a second anonymous constant.
	# NOTE(review): presumably a 30-byte message plus source location, and the call does not return - confirm.
	lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.532]
	lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.533]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB262_13:
	# Reached when rsi > rcx: call core::slice::index::slice_index_fail through the GOT with
	# edi = 2, rsi unchanged, rdx = old rcx and rcx = address of an anonymous constant.
	lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.534]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  16,081,891.90 ns/iter (+/- 276,797.20)
test bonsai_letter ... bench:  44,216,801.30 ns/iter (+/- 1,838,765.10)
test is_bonsai     ... bench:  24,650,638.80 ns/iter (+/- 181,554.95)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 25.67s

Artifact: x86_64-merge

Assembly:

Details
# Compiler-generated x86-64 (Intel syntax, SSE2 encodings) listing of jbonsai::vocoder::mlsa::fir.
# Register roles as read from the instructions below:
#   rdi = base of a buffer of 8-byte doubles that is rewritten in place, rsi = its element count
#   rdx = base of a second buffer of doubles that is only read, rcx = its element count
#   xmm0 = scalar input, xmm1 = scalar coefficient; the result is returned in xmm0
# NOTE(review): the Rust-level parameter names are not visible in this listing - confirm against the source.
.section .text.jbonsai::vocoder::mlsa::fir,"ax",@progbits
	.p2align	4
.type	jbonsai::vocoder::mlsa::fir,@function
jbonsai::vocoder::mlsa::fir:
	.cfi_startproc
	# The pushed rax value is never read; the push only moves rsp by 8
	# (presumably to keep the stack 16-byte aligned at the calls below).
	push rax
	.cfi_def_cfa_offset 16
	# rsi <= 1: jump to the panic call at .LBB263_12.
	cmp rsi, 1
	jbe .LBB263_12
	# xmm4 = xmm1*xmm1, xmm2 = (.LCPI263_0) - xmm4, xmm3 = old buf[1].
	# NOTE(review): the value of .LCPI263_0 is not shown in this listing.
	# buf[0] = xmm0*xmm1 and buf[1] = xmm1*(old buf[1]) + xmm0*xmm2 are stored before the rsi/rcx check.
	movapd xmm4, xmm1
	mulsd xmm4, xmm1
	movsd xmm2, qword ptr [rip + .LCPI263_0]
	subsd xmm2, xmm4
	movsd xmm3, qword ptr [rdi + 8]
	movapd xmm5, xmm0
	mulsd xmm5, xmm1
	movsd qword ptr [rdi], xmm5
	mulsd xmm0, xmm2
	movapd xmm6, xmm1
	mulsd xmm6, xmm3
	addsd xmm6, xmm0
	movsd qword ptr [rdi + 8], xmm6
	# rsi > rcx: jump to the slice_index_fail call at .LBB263_13.
	cmp rsi, rcx
	ja .LBB263_13
	# xmm3 = (old buf[1]) - (new buf[0]) is the scalar state carried forward.
	# rsi = count - 2; eax = rsi % 4 tail elements; rcx = rsi / 4 main-loop trips.
	subsd xmm3, xmm5
	add rsi, -2
	mov eax, esi
	and eax, 3
	mov rcx, rsi
	shr rcx, 2
	je .LBB263_3
	# Loop setup: xmm5 = xmm4*xmm4; xmm6, xmm7, xmm8 = xmm2, xmm1, xmm4 duplicated into both lanes;
	# xmm0 = zeroed two-lane accumulator, r8 = byte offset 32, xmm9 = carried state.
	movapd xmm5, xmm4
	mulsd xmm5, xmm4
	movapd xmm6, xmm2
	unpcklpd xmm6, xmm2
	movapd xmm7, xmm1
	unpcklpd xmm7, xmm1
	movapd xmm8, xmm4
	unpcklpd xmm8, xmm4
	xorpd xmm0, xmm0
	mov r8d, 32
	movapd xmm9, xmm3
	.p2align	4
.LBB263_7:
	# Main loop: each trip rewrites the four doubles at [rdi+r8-16, rdi+r8+16), multiplies the
	# new values by the four doubles at [rdx+r8-16, rdx+r8+16) and adds the products into xmm0.
	# xmm3 receives the carried state for the next trip (copied to xmm9 at the bottom).
	movapd xmm3, xmm1
	mulsd xmm3, xmm9
	movapd xmm10, xmm5
	mulsd xmm10, xmm9
	movupd xmm11, xmmword ptr [rdi + r8 - 16]
	movupd xmm12, xmmword ptr [rdi + r8 - 8]
	movupd xmm13, xmmword ptr [rdi + r8]
	movapd xmm14, xmm7
	mulpd xmm14, xmm11
	subsd xmm11, xmm3
	unpcklpd xmm9, xmm11
	movapd xmm11, xmm6
	mulpd xmm11, xmm9
	addpd xmm11, xmm14
	mulpd xmm9, xmm8
	movsd xmm3, qword ptr [rdi + r8 + 8]
	subpd xmm12, xmm14
	addpd xmm9, xmm12
	mulpd xmm9, xmm6
	mulpd xmm13, xmm7
	addpd xmm9, xmm13
	mulsd xmm12, xmm4
	subsd xmm3, xmm13
	addsd xmm3, xmm12
	addsd xmm3, xmm10
	movupd xmmword ptr [rdi + r8 - 16], xmm11
	movupd xmmword ptr [rdi + r8], xmm9
	movupd xmm10, xmmword ptr [rdx + r8 - 16]
	movupd xmm12, xmmword ptr [rdx + r8]
	mulpd xmm10, xmm11
	mulpd xmm12, xmm9
	addpd xmm12, xmm10
	addpd xmm0, xmm12
	add r8, 32
	movapd xmm9, xmm3
	dec rcx
	jne .LBB263_7
	test rax, rax
	je .LBB263_5
.LBB263_8:
	# Tail (eax = 1..3 elements). The mask 1152921504606846972 = 2^60 - 4 rounds rsi down to a
	# multiple of 4, so the tail elements are at indices rsi+2, rsi+3, rsi+4.
	# First tail element: new = xmm1*old + xmm2*xmm3; xmm4 = coef*new + (lane 0 of xmm0).
	movabs rcx, 1152921504606846972
	and rsi, rcx
	movapd xmm6, xmm2
	mulsd xmm6, xmm3
	movsd xmm5, qword ptr [rdi + 8*rsi + 16]
	movapd xmm4, xmm1
	mulsd xmm4, xmm5
	addsd xmm4, xmm6
	movsd qword ptr [rdi + 8*rsi + 16], xmm4
	mulsd xmm4, qword ptr [rdx + 8*rsi + 16]
	addsd xmm4, xmm0
	cmp eax, 1
	je .LBB263_11
	# Second tail element: state xmm5 = old - xmm1*xmm3, then the same update / accumulate step.
	mulsd xmm3, xmm1
	subsd xmm5, xmm3
	movapd xmm6, xmm2
	mulsd xmm6, xmm5
	movsd xmm3, qword ptr [rdi + 8*rsi + 24]
	movapd xmm7, xmm1
	mulsd xmm7, xmm3
	addsd xmm7, xmm6
	movsd qword ptr [rdi + 8*rsi + 24], xmm7
	mulsd xmm7, qword ptr [rdx + 8*rsi + 24]
	addsd xmm4, xmm7
	cmp eax, 2
	je .LBB263_11
	# Third tail element: same step once more (xmm1 and xmm2 are overwritten, they are no longer needed).
	mulsd xmm5, xmm1
	subsd xmm3, xmm5
	mulsd xmm2, xmm3
	mulsd xmm1, qword ptr [rdi + 8*rsi + 32]
	addsd xmm1, xmm2
	movsd qword ptr [rdi + 8*rsi + 32], xmm1
	mulsd xmm1, qword ptr [rdx + 8*rsi + 32]
	addsd xmm4, xmm1
.LBB263_11:
	# Result = upper lane of xmm0 + running scalar sum in xmm4.
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB263_3:
	.cfi_def_cfa_offset 16
	# No full four-element group: clear the accumulator and run the tail if eax != 0.
	xorpd xmm0, xmm0
	test rax, rax
	jne .LBB263_8
.LBB263_5:
	# No tail elements: result = lane 1 of xmm0 + lane 0 of xmm0.
	movapd xmm4, xmm0
	unpckhpd xmm0, xmm0
	addsd xmm0, xmm4
	pop rax
	.cfi_def_cfa_offset 8
	ret
.LBB263_12:
	.cfi_def_cfa_offset 16
	# Reached when rsi <= 1: call core::panicking::panic through the GOT with rdi = address of an
	# anonymous constant, esi = 30 and rdx = address of a second anonymous constant.
	# NOTE(review): presumably a 30-byte message plus source location, and the call does not return - confirm.
	lea rdi, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.517]
	lea rdx, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.518]
	mov esi, 30
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB263_13:
	# Reached when rsi > rcx: call core::slice::index::slice_index_fail through the GOT with
	# edi = 2, rsi unchanged, rdx = old rcx and rcx = address of an anonymous constant.
	lea rax, [rip + .Lanon.c52d3424c1a98a09d9b797d35b800c8b.519]
	mov edi, 2
	mov rdx, rcx
	mov rcx, rax
	call qword ptr [rip + core::slice::index::slice_index_fail@GOTPCREL]

Benchmark results:


running 3 tests
test bonsai        ... bench:  14,719,328.50 ns/iter (+/- 540,668.25)
test bonsai_letter ... bench:  40,249,033.90 ns/iter (+/- 338,242.87)
test is_bonsai     ... bench:  22,538,595.00 ns/iter (+/- 151,013.04)

test result: ok. 0 passed; 0 failed; 0 ignored; 3 measured; 0 filtered out; finished in 23.41s

@cm-ayf cm-ayf force-pushed the perf/mlpg-adjust branch from ca9223a to 55d2a5a Compare July 4, 2026 14:42
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant