Skip to content

Commit 7fd1feb

Browse files
authored
DEBUG-3175 DI snapshot/rate limit benchmark (#4207)
1 parent 11b9ae1 commit 7fd1feb

File tree

6 files changed

+306
-6
lines changed

6 files changed

+306
-6
lines changed

benchmarks/di_snapshot.rb

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
=begin
2+
3+
This benchmark verifies that the rate limits used for dynamic instrumentation
4+
probes are attainable.
5+
6+
Each benchmark performs as many operations as the rate limit permits -
7+
5000 for a basic probe and 1 for enriched probe. If the benchmark
8+
produces a rate of fewer than 1 "instructions" per second, the rate limit is
9+
not being reached. A result of more than 1 "instruction" per second
10+
means the rate limit is being reached.
11+
12+
Note that the number of "instructions per second" reported by benchmark/ips
13+
does not reflect how many times the instrumentation creates a snapshot -
14+
there can (and normally are) invocations of the target method that do not
15+
produce DI snapshots due to rate limit but these invocations are counted in
16+
the "instructions per second" reported by benchmark/ips.
17+
18+
The default dynamic instrumentation settings for the probe notifier worker
19+
(queue capacity of 100 and minimum send interval of 3 seconds) mean an
20+
effective rate limit of 30 snapshots per second for basic probes,
21+
which is shared across all probes in the application, which is significantly
22+
below the 5000 snapshots per second per probe that DI is theoretically
23+
supposed to achieve. However, to increase actual attainable snapshot rate
24+
to 5000/second, the probe notifier worker needs to be changed to send
25+
multiple network requests for a single queue processing run or be more
26+
aggressive in flushing the snapshots to the network when the queue is getting
27+
full. In either case care needs to be taken not to starve customer applications
28+
of CPU.
29+
30+
=end
31+
32+
# Used to quickly run benchmark under RSpec as part of the usual test suite, to validate it didn't bitrot
33+
VALIDATE_BENCHMARK_MODE = ENV['VALIDATE_BENCHMARK'] == 'true'
34+
35+
return unless __FILE__ == $PROGRAM_NAME || VALIDATE_BENCHMARK_MODE
36+
37+
require 'benchmark/ips'
38+
require 'datadog'
39+
require 'webrick'
40+
41+
class DISnapshotBenchmark
42+
# If we are validating the benchmark a single operation is sufficient.
43+
BASIC_RATE_LIMIT = VALIDATE_BENCHMARK_MODE ? 1 : 5000
44+
ENRICHED_RATE_LIMIT = 1
45+
46+
def initialize
47+
48+
Datadog::DI.activate_tracking!
49+
50+
Datadog.configure do |c|
51+
c.remote.enabled = true
52+
c.dynamic_instrumentation.enabled = true
53+
c.dynamic_instrumentation.internal.development = true
54+
55+
# Increase queue capacity and reduce min send interval
56+
# to be able to send more snapshots out.
57+
# The default settings will result in dropped snapshots
58+
# way before non-enriched probe rate limit is reached.
59+
c.dynamic_instrumentation.internal.snapshot_queue_capacity = 10000
60+
c.dynamic_instrumentation.internal.min_send_interval = 1
61+
end
62+
63+
Thread.new do
64+
# If there is an actual Datadog agent running locally, the server
65+
# used in this benchmark will fail to start.
66+
# Using an actual Datadog agent instead of the fake server should not
67+
# affect the indication of whether the rate limit is reachable
68+
# since the agent shouldn't take longer to process than the fake
69+
# web server (and the agent should also run on another core),
70+
# however using a real agent would forego reports of the number of
71+
# snapshots submitted and their size.
72+
server.start
73+
end
74+
75+
require_relative 'support/di_snapshot_target'
76+
end
77+
78+
def run_benchmark
79+
probe = Datadog::DI::Probe.new(
80+
id: 1, type: :log,
81+
type_name: 'DISnapshotTarget', method_name: 'test_method',
82+
rate_limit: BASIC_RATE_LIMIT,
83+
)
84+
85+
unless probe_manager.add_probe(probe)
86+
raise "Failed to instrument method (without snapshot capture)"
87+
end
88+
89+
@received_snapshot_count = 0
90+
@received_snapshot_bytes = 0
91+
92+
Benchmark.ips do |x|
93+
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
94+
x.config(
95+
**benchmark_time,
96+
)
97+
98+
x.report('method probe - basic') do
99+
BASIC_RATE_LIMIT.times do
100+
DISnapshotTarget.new.test_method
101+
end
102+
Datadog::DI.component.probe_notifier_worker.flush
103+
end
104+
105+
x.save! 'di-snapshot-results.json' unless VALIDATE_BENCHMARK_MODE
106+
x.compare!
107+
end
108+
109+
# DI does not provide an API to remove a specific probe because
110+
# this functionality is currently not needed by the product.
111+
probe_manager.remove_other_probes([])
112+
113+
puts "Received #{@received_snapshot_count} snapshots, #{@received_snapshot_bytes} bytes total"
114+
115+
probe = Datadog::DI::Probe.new(
116+
id: 1, type: :log,
117+
type_name: 'DISnapshotTarget', method_name: 'test_method',
118+
capture_snapshot: true,
119+
# Normally rate limit for enriched probes is 1.
120+
# To get a meaningful number of submissions, increase it to 20.
121+
# We should get about 200 snapshots in the 10 seconds that the
122+
# benchmark is supposed to run.
123+
rate_limit: ENRICHED_RATE_LIMIT,
124+
)
125+
126+
unless probe_manager.add_probe(probe)
127+
raise "Failed to instrument method (with snapshot capture)"
128+
end
129+
130+
@received_snapshot_count = 0
131+
@received_snapshot_bytes = 0
132+
133+
Benchmark.ips do |x|
134+
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
135+
x.config(
136+
**benchmark_time,
137+
)
138+
139+
x.report('method probe - enriched') do
140+
ENRICHED_RATE_LIMIT.times do
141+
DISnapshotTarget.new.test_method
142+
end
143+
Datadog::DI.component.probe_notifier_worker.flush
144+
end
145+
146+
x.save! 'di-snapshot-results.json' unless VALIDATE_BENCHMARK_MODE
147+
x.compare!
148+
end
149+
150+
probe_manager.remove_other_probes([])
151+
152+
puts "Received #{@received_snapshot_count} snapshots, #{@received_snapshot_bytes} bytes total"
153+
154+
probe = Datadog::DI::Probe.new(
155+
id: 1, type: :log,
156+
file: 'di_snapshot_target.rb', line_no: 20,
157+
capture_snapshot: false,
158+
rate_limit: BASIC_RATE_LIMIT,
159+
)
160+
161+
unless probe_manager.add_probe(probe)
162+
raise "Failed to instrument line (with snapshot capture)"
163+
end
164+
165+
@received_snapshot_count = 0
166+
@received_snapshot_bytes = 0
167+
168+
Benchmark.ips do |x|
169+
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
170+
x.config(
171+
**benchmark_time,
172+
)
173+
174+
x.report('line probe - basic') do
175+
BASIC_RATE_LIMIT.times do
176+
DISnapshotTarget.new.test_method
177+
end
178+
Datadog::DI.component.probe_notifier_worker.flush
179+
end
180+
181+
x.save! 'di-snapshot-results.json' unless VALIDATE_BENCHMARK_MODE
182+
x.compare!
183+
end
184+
185+
probe_manager.remove_other_probes([])
186+
187+
puts "Received #{@received_snapshot_count} snapshots, #{@received_snapshot_bytes} bytes total"
188+
189+
probe = Datadog::DI::Probe.new(
190+
id: 1, type: :log,
191+
file: 'di_snapshot_target.rb', line_no: 20,
192+
capture_snapshot: true,
193+
rate_limit: ENRICHED_RATE_LIMIT,
194+
)
195+
196+
unless probe_manager.add_probe(probe)
197+
raise "Failed to instrument line (with snapshot capture)"
198+
end
199+
200+
@received_snapshot_count = 0
201+
@received_snapshot_bytes = 0
202+
203+
Benchmark.ips do |x|
204+
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
205+
x.config(
206+
**benchmark_time,
207+
)
208+
209+
x.report('line probe - enriched') do
210+
ENRICHED_RATE_LIMIT.times do
211+
DISnapshotTarget.new.test_method
212+
end
213+
Datadog::DI.component.probe_notifier_worker.flush
214+
end
215+
216+
x.save! 'di-snapshot-results.json' unless VALIDATE_BENCHMARK_MODE
217+
x.compare!
218+
end
219+
220+
probe_manager.remove_other_probes([])
221+
222+
puts "Received #{@received_snapshot_count} snapshots, #{@received_snapshot_bytes} bytes total"
223+
end
224+
225+
private
226+
227+
def probe_manager
228+
Datadog::DI.component.probe_manager
229+
end
230+
231+
def server
232+
WEBrick::HTTPServer.new(
233+
Port: 8126,
234+
).tap do |server|
235+
@received_snapshot_count = 0
236+
@received_snapshot_bytes = 0
237+
238+
server.mount_proc('/debugger/v1/diagnostics') do |req, res|
239+
# This request is a multipart form post
240+
end
241+
242+
server.mount_proc('/debugger/v1/input') do |req, res|
243+
payload = JSON.parse(req.body)
244+
@received_snapshot_count += payload.length
245+
@received_snapshot_bytes += req.body.length
246+
end
247+
end
248+
end
249+
250+
attr_reader :received_snapshot_count
251+
end
252+
253+
puts "Current pid is #{Process.pid}"
254+
255+
DISnapshotBenchmark.new.instance_exec do
256+
run_benchmark
257+
end
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
class DISnapshotTarget
2+
def test_method
3+
# Perform some work to take up time
4+
SecureRandom.uuid
5+
6+
v1 = Datadog.configuration
7+
v2 = Datadog.configuration
8+
v3 = Datadog.configuration
9+
v4 = Datadog.configuration
10+
v5 = Datadog.configuration
11+
v6 = Datadog.configuration
12+
v7 = Datadog.configuration
13+
v8 = Datadog.configuration
14+
v9 = Datadog.configuration
15+
16+
# Currently Ruby DI does not implement capture of local variables
17+
# in method probes, or instance variables.
18+
# Return a complex object which will be serialized for the
19+
# enriched probe.
20+
Datadog.configuration # Line 20
21+
end
22+
end

lib/datadog/di/configuration/settings.rb

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,20 @@ def self.add_settings!(base)
166166
# being sent out by the probe notifier worker) and creates a
167167
# possibility of dropping payloads if the queue gets too long.
168168
option :min_send_interval do |o|
169-
o.type :int
169+
o.type :float
170170
o.default 3
171171
end
172172

173+
# Number of snapshots that can be stored in the probe
174+
# notifier worker queue. Larger capacity runs the risk of
175+
# creating snapshots that exceed the agent's request size
176+
# limit. Smaller capacity increases the risk of dropping
177+
# snapshots.
178+
option :snapshot_queue_capacity do |o|
179+
o.type :int
180+
o.default 100
181+
end
182+
173183
# Enable dynamic instrumentation in development environments.
174184
# Currently DI does not fully implement support for code
175185
# unloading and reloading, and is not supported in

lib/datadog/di/probe_notifier_worker.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,7 @@ def io_in_progress?
183183
define_method("add_#{event_type}") do |event|
184184
@lock.synchronize do
185185
queue = send("#{event_type}_queue")
186-
# TODO determine a suitable limit via testing/benchmarking
187-
if queue.length > 100
186+
if queue.length > settings.dynamic_instrumentation.internal.snapshot_queue_capacity
188187
logger.warn("#{self.class.name}: dropping #{event_type} because queue is full")
189188
else
190189
queue << event

spec/datadog/di/probe_notifier_worker_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
allow(settings.dynamic_instrumentation.internal).to receive(:propagate_all_exceptions).and_return(false)
1111
# Reduce to 1 to have the test run faster
1212
allow(settings.dynamic_instrumentation.internal).to receive(:min_send_interval).and_return(1)
13+
allow(settings.dynamic_instrumentation.internal).to receive(:snapshot_queue_capacity).and_return(10)
1314
end
1415

1516
let(:transport) do

spec/datadog/di/validate_benchmarks_spec.rb

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,29 @@
1010
end
1111

1212
benchmarks_to_validate = [
13-
"di_instrument",
13+
"di_instrument", "di_snapshot",
1414
].freeze
1515

1616
benchmarks_to_validate.each do |benchmark|
1717
describe benchmark do
18-
it("runs without raising errors") { expect_in_fork { load "./benchmarks/#{benchmark}.rb" } }
18+
timeout = if benchmark == 'di_snapshot'
19+
20
20+
else
21+
10
22+
end
23+
it "runs without raising errors" do
24+
expect_in_fork(timeout_seconds: timeout) do
25+
load "./benchmarks/#{benchmark}.rb"
26+
end
27+
end
1928
end
2029
end
2130

2231
# This test validates that we don't forget to add new benchmarks to benchmarks_to_validate
2332
it "tests all expected benchmarks in the benchmarks folder" do
24-
all_benchmarks = Dir["./benchmarks/di_*"].map { |it| it.gsub("./benchmarks/", "").gsub(".rb", "") }
33+
all_benchmarks = Dir["./benchmarks/di_*"].map do |it|
34+
it.gsub("./benchmarks/", "").gsub(".rb", "")
35+
end
2536

2637
expect(benchmarks_to_validate).to contain_exactly(*all_benchmarks)
2738
end

0 commit comments

Comments
 (0)