Skip to content

Commit 41d2a4d

Browse files
taegyunkimclaude
andcommitted
test(profiling): experiment - measure greenlet sampling malloc churn
LD_PRELOAD malloc-counting interposer; prints MALLOC_DELTA over a fixed greenlet-sampling window. Experiment to find a signal that separates the buffer-reuse fix from unfixed code (RSS does not). Always passes for now. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent caf0b62 commit 41d2a4d

1 file changed

Lines changed: 171 additions & 0 deletions

File tree

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""Measure native malloc churn during sustained unwind_greenlets sampling.
2+
3+
On a single sampling thread, allocate-then-free each sample reuses the same
4+
freed blocks, so RSS/arena are identical between the buffer-reuse fix and
5+
unfixed code. The signal that actually separates them is the *number* of
6+
allocations: unfixed code allocates one StackInfo (std::deque<Frame>) per
7+
tracked greenlet per sample; the fix reuses buffers across samples.
8+
9+
We count allocations with a tiny LD_PRELOAD malloc/calloc/realloc interposer
10+
(black-box; works identically on both builds) and report the delta over a
11+
fixed sampling window.
12+
13+
EXPERIMENT MODE: prints MALLOC_DELTA and always passes, so we can read the
14+
real numbers from CI on both the unfixed (#18422) and fixed builds before
15+
choosing a threshold.
16+
"""
17+
18+
import os
19+
import subprocess
20+
import sys
21+
import textwrap
22+
23+
import pytest
24+
25+
26+
# Gate for the gevent-based tests in this module. Truthy only when the
# DD_PROFILE_TEST_GEVENT environment variable is set to a non-empty value AND
# the interpreter is either older than 3.13 or a 3.13 build with the GIL.
#
# sys.version_info[3] is the release level ("alpha", "beta", "candidate",
# "final") and can never equal "free-threading", so free-threaded builds are
# detected via sys.version, which contains "free-threading build" on them.
GEVENT_COMPATIBLE_WITH_PYTHON_VERSION = os.getenv("DD_PROFILE_TEST_GEVENT", False) and (
    sys.version_info[:2] < (3, 13)
    or (sys.version_info[:2] == (3, 13) and "free-threading" not in sys.version)
)
29+
30+
_INTERPOSER_C = r"""
31+
#define _GNU_SOURCE
32+
#include <dlfcn.h>
33+
#include <stdatomic.h>
34+
#include <stddef.h>
35+
36+
static atomic_ullong g_cnt;
37+
static void *(*r_malloc)(size_t);
38+
static void *(*r_calloc)(size_t, size_t);
39+
static void *(*r_realloc)(void *, size_t);
40+
41+
/* Serve calloc out of a static buffer while dlsym() bootstraps. */
42+
static char g_tmp[1 << 20];
43+
static size_t g_off;
44+
static int g_initing;
45+
46+
static void mc_init(void) {
47+
g_initing = 1;
48+
r_malloc = dlsym(RTLD_NEXT, "malloc");
49+
r_calloc = dlsym(RTLD_NEXT, "calloc");
50+
r_realloc = dlsym(RTLD_NEXT, "realloc");
51+
g_initing = 0;
52+
}
53+
54+
void *malloc(size_t s) {
55+
if (!r_malloc) mc_init();
56+
atomic_fetch_add(&g_cnt, 1);
57+
return r_malloc(s);
58+
}
59+
60+
void *realloc(void *p, size_t s) {
61+
if (!r_realloc) mc_init();
62+
atomic_fetch_add(&g_cnt, 1);
63+
return r_realloc(p, s);
64+
}
65+
66+
void *calloc(size_t n, size_t s) {
67+
if (g_initing) {
68+
size_t t = n * s;
69+
void *p = g_tmp + g_off;
70+
g_off += (t + 15) & ~((size_t)15);
71+
return p;
72+
}
73+
if (!r_calloc) mc_init();
74+
atomic_fetch_add(&g_cnt, 1);
75+
return r_calloc(n, s);
76+
}
77+
78+
unsigned long long mc_get(void) { return atomic_load(&g_cnt); }
79+
"""
80+
81+
# Program run inside the LD_PRELOAD'd subprocess. Prints MALLOC_DELTA=<n>.
#
# Outline of the script:
#   1. monkey-patch with gevent before anything else is imported;
#   2. open the process's global symbol namespace with ctypes.CDLL(None) to
#      reach the mc_get() counter exported by the preloaded interposer;
#   3. start the profiler, set the stack sampler interval to 0.005
#      (presumably seconds -- confirm against the stack module) and turn
#      adaptive sampling off;
#   4. spawn N_IDLE greenlets, each parked in gevent.sleep(1000) beneath
#      STACK_DEPTH recursive _idle_deep() frames, then wait WARMUP_S;
#   5. read mc_get() before and after a MEASURE_S window in which the main
#      greenlet only yields every 0.05 s, and print the difference plus the
#      run parameters;
#   6. kill the idle greenlets and stop the profiler.
_INNER = textwrap.dedent(
    """
    from gevent import monkey
    monkey.patch_all()

    import ctypes
    import time

    import gevent

    from ddtrace.internal.datadog.profiling import stack
    from ddtrace.profiling import profiler

    me = ctypes.CDLL(None)
    me.mc_get.restype = ctypes.c_ulonglong

    N_IDLE = 2000
    STACK_DEPTH = 50
    WARMUP_S = 1.0
    MEASURE_S = 5.0

    def _idle_deep(depth):
        if depth > 0:
            _idle_deep(depth - 1)
        else:
            gevent.sleep(1000)

    def idle_greenlet():
        _idle_deep(STACK_DEPTH)

    p = profiler.Profiler()
    p.start()
    stack.set_interval(0.005)
    stack.set_adaptive_sampling(False)
    idles = [gevent.spawn(idle_greenlet) for _ in range(N_IDLE)]
    gevent.sleep(WARMUP_S)

    start = me.mc_get()
    t_end = time.monotonic() + MEASURE_S
    n_yields = 0
    while time.monotonic() < t_end:
        gevent.sleep(0.05)
        n_yields += 1
    delta = me.mc_get() - start

    print("MALLOC_DELTA=%d" % delta)
    print("MEASURE_S=%.1f N_IDLE=%d STACK_DEPTH=%d YIELDS=%d" % (MEASURE_S, N_IDLE, STACK_DEPTH, n_yields))

    gevent.killall(idles, timeout=5)
    p.stop()
    """
)
134+
135+
136+
@pytest.mark.skipif(not sys.platform.startswith("linux"), reason="LD_PRELOAD interposer is Linux-only")
@pytest.mark.skipif(not GEVENT_COMPATIBLE_WITH_PYTHON_VERSION, reason="gevent not compatible")
def test_greenlet_malloc_churn(tmp_path) -> None:
    """Build the malloc-counting interposer and run the sampling workload under it.

    The C shim in ``_INTERPOSER_C`` is compiled into ``tmp_path`` with ``$CC``
    (default ``cc``), then ``_INNER`` is executed in a child interpreter with
    the shim in ``LD_PRELOAD``. The child's stdout/stderr are echoed so the
    reported ``MALLOC_DELTA`` shows up in the test log.

    The test is skipped when the shim cannot be built. It fails if the child
    exits non-zero or never prints ``MALLOC_DELTA=``. No threshold is asserted
    yet (experiment mode).
    """
    cc = os.environ.get("CC", "cc")
    src = tmp_path / "mc.c"
    so = tmp_path / "mc.so"
    src.write_text(_INTERPOSER_C)
    try:
        subprocess.run(
            [cc, "-shared", "-fPIC", "-O2", "-o", str(so), str(src), "-ldl"],
            check=True,
            capture_output=True,
            text=True,
        )
    except OSError as e:
        # No usable compiler on this machine.
        pytest.skip(f"could not build malloc interposer: {e}")
    except subprocess.CalledProcessError as e:
        # Include the compiler diagnostics; str(e) only carries the exit code,
        # which makes a broken interposer source impossible to diagnose.
        pytest.skip(f"could not build malloc interposer: {e}\n{e.stderr}")

    env = dict(os.environ)
    # Put our shim first while preserving any LD_PRELOAD already in effect.
    env["LD_PRELOAD"] = (str(so) + " " + env.get("LD_PRELOAD", "")).strip()
    env["DD_PROFILING_OUTPUT_PPROF"] = str(tmp_path / "prof")
    # Pin glibc to a single arena so the sampling thread does not get its own
    # and allocator behavior stays deterministic between runs. The number of
    # malloc calls itself does not depend on the arena count.
    env["MALLOC_ARENA_MAX"] = "1"

    proc = subprocess.run(
        [sys.executable, "-c", _INNER],
        env=env,
        capture_output=True,
        text=True,
        timeout=120,
    )
    # Echo the child's output so the numbers are visible in CI logs.
    sys.stderr.write(proc.stderr)
    sys.stdout.write(proc.stdout)
    assert proc.returncode == 0, f"inner subprocess failed: {proc.returncode}"
    assert "MALLOC_DELTA=" in proc.stdout
    # EXPERIMENT: do not assert a threshold yet; just surface the number.

0 commit comments

Comments
 (0)