Skip to content

Commit 3e3ad39

Browse files
authored
Merge pull request #3103 from DataDog/ivoanjo/prof-7307-wire-up-allocation-sampling
[PROF-7307] Wire up allocation sampling into `CpuAndWallTimeWorker`
2 parents e6e917c + b045835 commit 3e3ad39

File tree

7 files changed

+163
-15
lines changed

7 files changed

+163
-15
lines changed

ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c

+38-6
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ struct cpu_and_wall_time_worker_state {
8383
bool allocation_counting_enabled;
8484
bool no_signals_workaround_enabled;
8585
bool dynamic_sampling_rate_enabled;
86+
int allocation_sample_every; // Temporarily used for development/testing of allocation profiling
8687
VALUE self_instance;
8788
VALUE thread_context_collector_instance;
8889
VALUE idle_sampling_helper_instance;
@@ -150,7 +151,8 @@ static VALUE _native_initialize(
150151
VALUE idle_sampling_helper_instance,
151152
VALUE allocation_counting_enabled,
152153
VALUE no_signals_workaround_enabled,
153-
VALUE dynamic_sampling_rate_enabled
154+
VALUE dynamic_sampling_rate_enabled,
155+
VALUE allocation_sample_every
154156
);
155157
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
156158
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
@@ -183,9 +185,10 @@ static void grab_gvl_and_sample(void);
183185
static void reset_stats(struct cpu_and_wall_time_worker_state *state);
184186
static void sleep_for(uint64_t time_ns);
185187
static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self);
186-
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
188+
static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
187189
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);
188190
static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self);
191+
static VALUE rescued_sample_allocation(VALUE tracepoint_data);
189192

190193
// Note on sampler global state safety:
191194
//
@@ -223,7 +226,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
223226
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
224227
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
225228

226-
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 7);
229+
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 8);
227230
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
228231
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
229232
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
@@ -261,6 +264,7 @@ static VALUE _native_new(VALUE klass) {
261264
state->allocation_counting_enabled = false;
262265
state->no_signals_workaround_enabled = false;
263266
state->dynamic_sampling_rate_enabled = true;
267+
state->allocation_sample_every = 0;
264268
state->thread_context_collector_instance = Qnil;
265269
state->idle_sampling_helper_instance = Qnil;
266270
state->owner_thread = Qnil;
@@ -287,12 +291,14 @@ static VALUE _native_initialize(
287291
VALUE idle_sampling_helper_instance,
288292
VALUE allocation_counting_enabled,
289293
VALUE no_signals_workaround_enabled,
290-
VALUE dynamic_sampling_rate_enabled
294+
VALUE dynamic_sampling_rate_enabled,
295+
VALUE allocation_sample_every
291296
) {
292297
ENFORCE_BOOLEAN(gc_profiling_enabled);
293298
ENFORCE_BOOLEAN(allocation_counting_enabled);
294299
ENFORCE_BOOLEAN(no_signals_workaround_enabled);
295300
ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled);
301+
ENFORCE_TYPE(allocation_sample_every, T_FIXNUM);
296302

297303
struct cpu_and_wall_time_worker_state *state;
298304
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
@@ -301,6 +307,12 @@ static VALUE _native_initialize(
301307
state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
302308
state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue);
303309
state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue);
310+
state->allocation_sample_every = NUM2INT(allocation_sample_every);
311+
312+
if (state->allocation_sample_every < 0) {
313+
rb_raise(rb_eArgError, "Unexpected value for allocation_sample_every: %d. This value must be >= 0.", state->allocation_sample_every);
314+
}
315+
304316
state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance);
305317
state->idle_sampling_helper_instance = idle_sampling_helper_instance;
306318
state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
@@ -880,7 +892,7 @@ static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
880892

881893
// Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
882894
// when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
883-
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
895+
static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
884896
// Update thread-local allocation count
885897
if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
886898
allocation_count = 0;
@@ -907,7 +919,12 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED
907919
// defined as not being able to allocate) sets this.
908920
state->during_sample = true;
909921

910-
// TODO: Sampling goes here (calling into `thread_context_collector_sample_allocation`)
922+
// TODO: This is a placeholder sampling decision strategy. We plan to replace it with a better one soon (e.g. before
923+
// beta), and having something here allows us to test the rest of feature, sampling decision aside.
924+
if (state->allocation_sample_every > 0 && ((allocation_count % state->allocation_sample_every) == 0)) {
925+
// Rescue against any exceptions that happen during sampling
926+
safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);
927+
}
911928

912929
state->during_sample = false;
913930
}
@@ -929,3 +946,18 @@ static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self) {
929946
return result;
930947
}
931948
}
949+
950+
static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
951+
struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
952+
953+
// This should not happen in a normal situation because on_newobj_event already checked for this, but just in case...
954+
if (state == NULL) return Qnil;
955+
956+
rb_trace_arg_t *data = rb_tracearg_from_tracepoint(tracepoint_data);
957+
VALUE new_object = rb_tracearg_object(data);
958+
959+
thread_context_collector_sample_allocation(state->thread_context_collector_instance, state->allocation_sample_every, new_object);
960+
961+
// Return a dummy VALUE because we're called from rb_rescue2 which requires it
962+
return Qnil;
963+
}

ext/ddtrace_profiling_native_extension/collectors_thread_context.c

+48-8
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector
206206
static VALUE thread_list(struct thread_context_collector_state *state);
207207
static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object);
208208
static VALUE _native_new_empty_thread(VALUE self);
209+
ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type);
209210

210211
void collectors_thread_context_init(VALUE profiling_module) {
211212
VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
@@ -1122,17 +1123,30 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in
11221123
type == RUBY_T_SYMBOL ||
11231124
type == RUBY_T_FIXNUM
11241125
) {
1125-
const char *name = rb_obj_classname(new_object);
1126-
size_t name_length = name != NULL ? strlen(name) : 0;
1127-
1128-
if (name_length > 0) {
1129-
class_name = (ddog_CharSlice) {.ptr = name, .len = name_length};
1126+
VALUE klass = rb_class_of(new_object);
1127+
1128+
// Ruby sometimes plays a bit fast and loose with some of its internal objects, e.g.
1129+
// `rb_str_tmp_frozen_acquire` allocates a string with no class (klass=0).
1130+
// Thus, we need to make sure there's actually a class before getting its name.
1131+
1132+
if (klass != 0) {
1133+
const char *name = rb_obj_classname(new_object);
1134+
size_t name_length = name != NULL ? strlen(name) : 0;
1135+
1136+
if (name_length > 0) {
1137+
class_name = (ddog_CharSlice) {.ptr = name, .len = name_length};
1138+
} else {
1139+
// @ivoanjo: I'm not sure this can ever happen, but just-in-case
1140+
class_name = ruby_value_type_to_class_name(type);
1141+
}
11301142
} else {
1131-
class_name = DDOG_CHARSLICE_C("(Anonymous)");
1143+
// Fallback for objects with no class
1144+
class_name = ruby_value_type_to_class_name(type);
11321145
}
1146+
} else if (type == RUBY_T_IMEMO) {
1147+
class_name = DDOG_CHARSLICE_C("(VM Internal, T_IMEMO)");
11331148
} else {
1134-
class_name = ruby_vm_type; // For internal things and, we just use the VM type
1135-
// TODO: Maybe prefix them with a nice note? E.g. (VM Internal, T_IMEMO)
1149+
class_name = ruby_vm_type; // For other weird internal things we just use the VM type
11361150
}
11371151
}
11381152

@@ -1164,3 +1178,29 @@ static VALUE new_empty_thread_inner(DDTRACE_UNUSED void *arg) { return Qnil; }
11641178
static VALUE _native_new_empty_thread(DDTRACE_UNUSED VALUE self) {
11651179
return rb_thread_create(new_empty_thread_inner, NULL);
11661180
}
1181+
1182+
ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) {
1183+
switch (type) {
1184+
case(RUBY_T_OBJECT ): return DDOG_CHARSLICE_C("Object");
1185+
case(RUBY_T_CLASS ): return DDOG_CHARSLICE_C("Class");
1186+
case(RUBY_T_MODULE ): return DDOG_CHARSLICE_C("Module");
1187+
case(RUBY_T_FLOAT ): return DDOG_CHARSLICE_C("Float");
1188+
case(RUBY_T_STRING ): return DDOG_CHARSLICE_C("String");
1189+
case(RUBY_T_REGEXP ): return DDOG_CHARSLICE_C("Regexp");
1190+
case(RUBY_T_ARRAY ): return DDOG_CHARSLICE_C("Array");
1191+
case(RUBY_T_HASH ): return DDOG_CHARSLICE_C("Hash");
1192+
case(RUBY_T_STRUCT ): return DDOG_CHARSLICE_C("Struct");
1193+
case(RUBY_T_BIGNUM ): return DDOG_CHARSLICE_C("Integer");
1194+
case(RUBY_T_FILE ): return DDOG_CHARSLICE_C("File");
1195+
case(RUBY_T_DATA ): return DDOG_CHARSLICE_C("(VM Internal, T_DATA)");
1196+
case(RUBY_T_MATCH ): return DDOG_CHARSLICE_C("MatchData");
1197+
case(RUBY_T_COMPLEX ): return DDOG_CHARSLICE_C("Complex");
1198+
case(RUBY_T_RATIONAL): return DDOG_CHARSLICE_C("Rational");
1199+
case(RUBY_T_NIL ): return DDOG_CHARSLICE_C("NilClass");
1200+
case(RUBY_T_TRUE ): return DDOG_CHARSLICE_C("TrueClass");
1201+
case(RUBY_T_FALSE ): return DDOG_CHARSLICE_C("FalseClass");
1202+
case(RUBY_T_SYMBOL ): return DDOG_CHARSLICE_C("Symbol");
1203+
case(RUBY_T_FIXNUM ): return DDOG_CHARSLICE_C("Integer");
1204+
default: return DDOG_CHARSLICE_C("(VM Internal, Missing class)");
1205+
}
1206+
}

lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb

+10-1
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,22 @@ def initialize(
2121
idle_sampling_helper: IdleSamplingHelper.new,
2222
# **NOTE**: This should only be used for testing; disabling the dynamic sampling rate will increase the
2323
# profiler overhead!
24-
dynamic_sampling_rate_enabled: true
24+
dynamic_sampling_rate_enabled: true,
25+
allocation_sample_every: 0 # Currently only for testing; Setting this to > 0 can add a lot of overhead!
2526
)
2627
unless dynamic_sampling_rate_enabled
2728
Datadog.logger.warn(
2829
'Profiling dynamic sampling rate disabled. This should only be used for testing, and will increase overhead!'
2930
)
3031
end
3132

33+
if allocation_counting_enabled && allocation_sample_every > 0
34+
Datadog.logger.warn(
35+
"Enabled experimental allocation profiling: allocation_sample_every=#{allocation_sample_every}. This is " \
36+
'experimental, not recommended, and will increase overhead!'
37+
)
38+
end
39+
3240
self.class._native_initialize(
3341
self,
3442
thread_context_collector,
@@ -37,6 +45,7 @@ def initialize(
3745
allocation_counting_enabled,
3846
no_signals_workaround_enabled,
3947
dynamic_sampling_rate_enabled,
48+
allocation_sample_every,
4049
)
4150
@worker_thread = nil
4251
@failure_exception = nil

lib/datadog/profiling/component.rb

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
9191
allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled,
9292
no_signals_workaround_enabled: no_signals_workaround_enabled,
9393
thread_context_collector: thread_context_collector,
94+
allocation_sample_every: 0,
9495
)
9596
else
9697
load_pprof_support

sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ module Datadog
1414
thread_context_collector: Datadog::Profiling::Collectors::ThreadContext,
1515
?idle_sampling_helper: Datadog::Profiling::Collectors::IdleSamplingHelper,
1616
?dynamic_sampling_rate_enabled: bool,
17+
?allocation_sample_every: Integer,
1718
) -> void
1819

1920
def self._native_initialize: (
@@ -24,6 +25,7 @@ module Datadog
2425
bool allocation_counting_enabled,
2526
bool no_signals_workaround_enabled,
2627
bool dynamic_sampling_rate_enabled,
28+
::Integer allocation_sample_every,
2729
) -> true
2830

2931
def start: () -> bool?

spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb

+63
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,69 @@
412412
expect(stats.fetch(:trigger_sample_attempts)).to eq(stats.fetch(:postponed_job_success))
413413
end
414414
end
415+
416+
context 'when allocation sampling is enabled' do
417+
let(:options) { { allocation_sample_every: 1 } }
418+
419+
before do
420+
allow(Datadog.logger).to receive(:warn)
421+
end
422+
423+
it 'logs a warning message mentioning this is experimental' do
424+
expect(Datadog.logger).to receive(:warn).with(/Enabled experimental allocation profiling/)
425+
426+
start
427+
end
428+
429+
it 'records allocated objects' do
430+
stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo))
431+
432+
start
433+
434+
123.times { CpuAndWallTimeWorkerSpec::TestStruct.new }
435+
allocation_line = __LINE__ - 1
436+
437+
cpu_and_wall_time_worker.stop
438+
439+
allocation_sample =
440+
samples_for_thread(samples_from_pprof(recorder.serialize!), Thread.current)
441+
.find { |s| s.labels[:'allocation class'] == 'CpuAndWallTimeWorkerSpec::TestStruct' }
442+
443+
expect(allocation_sample.values).to include(:'alloc-samples' => 123)
444+
expect(allocation_sample.locations.first.lineno).to eq allocation_line
445+
end
446+
447+
context 'when sampling optimized Ruby strings' do
448+
# Regression test: Some internal Ruby classes use a `rb_str_tmp_frozen_acquire` function which allocates a
449+
# weird "intermediate" string object that has its class pointer set to 0.
450+
#
451+
# When such an object gets sampled, we need to take care not to try to resolve its class name.
452+
#
453+
# In practice, this test is actually validating behavior of the `ThreadContext` collector, but we can only
454+
# really trigger this situation when using the allocation tracepoint, which lives in the `CpuAndWallTimeWorker`.
455+
it 'does not crash' do
456+
start
457+
458+
expect(Time.new.strftime(String.new('Potato'))).to_not be nil
459+
end
460+
end
461+
end
462+
463+
context 'when allocation sampling is disabled' do
464+
let(:options) { { allocation_sample_every: 0 } }
465+
466+
it 'does not record allocations' do
467+
stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo))
468+
469+
start
470+
471+
123.times { CpuAndWallTimeWorkerSpec::TestStruct.new }
472+
473+
cpu_and_wall_time_worker.stop
474+
475+
expect(samples_from_pprof(recorder.serialize!).map(&:values)).to all(include(:'alloc-samples' => 0))
476+
end
477+
end
415478
end
416479

417480
describe 'Ractor safety' do

spec/datadog/profiling/component_spec.rb

+1
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@
167167
allocation_counting_enabled: anything,
168168
no_signals_workaround_enabled: :no_signals_result,
169169
thread_context_collector: instance_of(Datadog::Profiling::Collectors::ThreadContext),
170+
allocation_sample_every: 0,
170171
)
171172

172173
build_profiler_component

0 commit comments

Comments
 (0)