
Commit aaf378e

Merge pull request ocaml#13227 from gadmm/caml_plat_lock_non_blocking2
Audit and fix caml_plat_lock_blocking usage
2 parents 86470c2 + a3c0dd0 commit aaf378e

12 files changed, +51 -25 lines changed

Diff for: Changes

+4

@@ -64,6 +64,10 @@ Working version
 - #13701: optimize `caml_continuation_use` based on #12735
   (Hugo Heuzard, review by KC Sivaramakrishnan)

+- #13227: Review of locking in the multicore runtime. Fix deadlocks in
+  runtime events and potential deadlocks with named values.
+  (Guillaume Munch-Maccagnoni, review by Gabriel Scherer)
+
 ### Code generation and optimizations:

 * #13050: Use '$' instead of '.' to separate module names in symbol names.

Diff for: runtime/callback.c

+3 -3

@@ -390,7 +390,7 @@ CAMLprim value caml_register_named_value(value vname, value val)
   unsigned int h = hash_value_name(name);
   int found = 0;

-  caml_plat_lock_blocking(&named_value_lock);
+  caml_plat_lock_non_blocking(&named_value_lock);
   for (struct named_value *nv = named_value_table[h];
        nv != NULL;
        nv = nv->next) {
@@ -416,7 +416,7 @@ CAMLprim value caml_register_named_value(value vname, value val)

 CAMLexport const value* caml_named_value(char const *name)
 {
-  caml_plat_lock_blocking(&named_value_lock);
+  caml_plat_lock_non_blocking(&named_value_lock);
   for (struct named_value *nv = named_value_table[hash_value_name(name)];
        nv != NULL;
        nv = nv->next) {
@@ -431,7 +431,7 @@ CAMLexport const value* caml_named_value(char const *name)

 CAMLexport void caml_iterate_named_values(caml_named_action f)
 {
-  caml_plat_lock_blocking(&named_value_lock);
+  caml_plat_lock_non_blocking(&named_value_lock);
   for (int i = 0; i < Named_value_size; i++){
     for (struct named_value *nv = named_value_table[i];
          nv != NULL;

Diff for: runtime/caml/frame_descriptors.h

+2 -1

@@ -119,7 +119,8 @@ void* caml_copy_and_register_frametable(void *table, int size);

 /* The unregistered frametables can still be in use after calling
    this function. Thus, you should not free their memory.
-   Note: it may reorder the content of the array 'tables' */
+   Note: it may reorder the content of the array 'tables'.
+   This can be called from a custom block finalizer. */
 void caml_unregister_frametables(void **tables, int ntables);
 void caml_unregister_frametable(void *table);

Diff for: runtime/caml/io.h

+6 -2

@@ -63,8 +63,12 @@ struct channel {

 enum {
   CHANNEL_FLAG_FROM_SOCKET = 1, /* For Windows */
-  CHANNEL_FLAG_MANAGED_BY_GC = 4, /* Free and close using GC finalization */
-  CHANNEL_TEXT_MODE = 8, /* "Text mode" for Windows and Cygwin */
+  CHANNEL_FLAG_MANAGED_BY_GC = 4, /* Free and close using GC finalization. */
+  /* Note: For backwards-compatibility, channels without the flag
+     CHANNEL_FLAG_MANAGED_BY_GC can be used inside single-threaded
+     programs without locking. As a consequence, using such a channel
+     from an asynchronous callback can result in deadlocks. */
+  CHANNEL_TEXT_MODE = 8, /* "Text mode" for Windows and Cygwin */
   CHANNEL_FLAG_UNBUFFERED = 16 /* Unbuffered (for output channels only) */
 };

Diff for: runtime/caml/platform.h

+4

@@ -90,6 +90,10 @@ Caml_inline void cpu_relax(void) {
    The domain lock must be held in order to call
    [caml_plat_lock_non_blocking].

+   It is possible to combine calls to [caml_plat_lock_non_blocking] on
+   a mutex from the mutator with calls to [caml_plat_lock_blocking] on
+   the same mutex from a STW section.
+
    These functions never raise exceptions; errors are fatal. Thus, for
    usages where bugs are susceptible to be introduced by users, the
    functions from caml/sync.h should be used instead.
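
To illustrate the combination the added paragraph permits, here is a minimal sketch (the mutex, functions, and state below are hypothetical, not taken from the runtime): the mutator side uses [caml_plat_lock_non_blocking], while a path running inside a stop-the-world section uses [caml_plat_lock_blocking] on the same mutex, as runtime_events.c does with user_events_lock.

#include "caml/platform.h"   /* runtime-internal header */

static caml_plat_mutex stats_lock = CAML_PLAT_MUTEX_INITIALIZER;
static long stats = 0;

/* Mutator side: runs with the domain lock held, so it takes the mutex
   with the non-blocking variant. */
void sketch_record_from_mutator(long n)
{
  caml_plat_lock_non_blocking(&stats_lock);
  stats += n;
  caml_plat_unlock(&stats_lock);
}

/* STW side: runs inside a stop-the-world section, where the plain
   blocking variant is used on the same mutex. */
long sketch_read_from_stw(void)
{
  long n;
  caml_plat_lock_blocking(&stats_lock);
  n = stats;
  caml_plat_unlock(&stats_lock);
  return n;
}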

Diff for: runtime/codefrag.c

+8 -1

@@ -130,7 +130,14 @@ unsigned char *caml_digest_of_code_fragment(struct code_fragment *cf) {
   /* Note: this approach is a bit heavy-handed as we take a lock in
      all cases. It would be possible to take a lock only in the
      DIGEST_LATER case, which occurs at most once per fragment, by
-     using double-checked locking -- see #11791. */
+     using double-checked locking -- see #11791.
+
+     Note: we use [caml_plat_lock_blocking] despite holding the domain
+     lock because this is called by intern.c and extern.c, both of
+     which share state between threads of the same domain. The
+     critical section must therefore remain short and not allocate
+     (nor cause other potential STW).
+  */
   caml_plat_lock_blocking(&cf->mutex);
   {
     if (cf->digest_status == DIGEST_IGNORE) {
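
The comment above mentions double-checked locking as a possible refinement (#11791). A minimal sketch of that technique with C11 atomics (all names are hypothetical, not code from this patch): readers take the mutex only on the slow path, which happens at most once per object.

#include <stdatomic.h>
#include "caml/platform.h"

/* Hypothetical lazily-computed value guarded by double-checked locking. */
struct lazy_digest {
  _Atomic(unsigned char *) digest;   /* NULL until computed */
  caml_plat_mutex mutex;             /* assumed initialized elsewhere */
};

unsigned char *sketch_compute_digest(struct lazy_digest *ld);  /* hypothetical */

unsigned char *sketch_get_digest(struct lazy_digest *ld)
{
  /* Fast path: acquire load, no locking once the digest is published. */
  unsigned char *d = atomic_load_explicit(&ld->digest, memory_order_acquire);
  if (d != NULL) return d;

  /* Slow path: recheck under the lock, then publish with a release store. */
  caml_plat_lock_blocking(&ld->mutex);
  d = atomic_load_explicit(&ld->digest, memory_order_relaxed);
  if (d == NULL) {
    d = sketch_compute_digest(ld);
    atomic_store_explicit(&ld->digest, d, memory_order_release);
  }
  caml_plat_unlock(&ld->mutex);
  return d;
}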

Diff for: runtime/frame_descriptors.c

+1

@@ -326,6 +326,7 @@ static void remove_frame_descriptors(
   void *frametable;
   caml_frametable_list ** previous;

+  /* cannot release the domain lock here (e.g. custom block finaliser) */
   caml_plat_lock_blocking(&table->mutex);

   previous = &table->frametables;

Diff for: runtime/gc_stats.c

+4 -1

@@ -83,7 +83,9 @@ void caml_reset_domain_alloc_stats(caml_domain_state *local)
 static caml_plat_mutex orphan_lock = CAML_PLAT_MUTEX_INITIALIZER;
 static struct alloc_stats orphaned_alloc_stats = {0,};

-void caml_accum_orphan_alloc_stats(struct alloc_stats *acc) {
+void caml_accum_orphan_alloc_stats(struct alloc_stats *acc)
+{
+  /* This is called from the collector as well as from the mutator. */
   caml_plat_lock_blocking(&orphan_lock);
   caml_accum_alloc_stats(acc, &orphaned_alloc_stats);
   caml_plat_unlock(&orphan_lock);
@@ -97,6 +99,7 @@ void caml_orphan_alloc_stats(caml_domain_state *domain) {
   caml_reset_domain_alloc_stats(domain);

   /* push them into the orphan stats */
+  /* This is called from the collector as well as from the mutator. */
   caml_plat_lock_blocking(&orphan_lock);
   caml_accum_alloc_stats(&orphaned_alloc_stats, &alloc_stats);
   caml_plat_unlock(&orphan_lock);

Diff for: runtime/globroots.c

+4

@@ -25,6 +25,10 @@
 #include "caml/skiplist.h"
 #include "caml/stack.h"

+/* This mutex must be locked with [caml_plat_lock_blocking] from the
+   mutator, because caml_{register,remove}_{generational_}roots can be
+   called in places where the domain lock is not safe to be
+   released. */
 static caml_plat_mutex roots_mutex = CAML_PLAT_MUTEX_INITIALIZER;

 /* Greater than zero when the current thread is scanning the roots */
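
As a hedged illustration of why these entry points can be reached without the option of releasing the domain lock (the custom block and its finalizer below are hypothetical): finalizers of custom blocks run during garbage collection, and removing a global root from one of them lands in globroots.c, mirroring the "custom block finaliser" note in frame_descriptors.c.

#include <caml/custom.h>
#include <caml/memory.h>
#include <caml/mlvalues.h>

/* Hypothetical custom block payload owning a generational global root. */
struct handle { value root; };

/* Custom block finalizers run during garbage collection, where the
   domain lock cannot be released; removing the root below reaches
   globroots.c, which is why roots_mutex is taken with
   caml_plat_lock_blocking rather than the non-blocking variant. */
static void handle_finalize(value v)
{
  struct handle *h = (struct handle *) Data_custom_val(v);
  caml_remove_generational_global_root(&h->root);
}
/* handle_finalize would be installed as the 'finalize' field of a
   struct custom_operations passed to caml_alloc_custom. */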

Diff for: runtime/io.c

+4 -4

@@ -559,7 +559,7 @@ void caml_finalize_channel(value vchan)
   }
   /* Don't run concurrently with caml_ml_out_channels_list that may resurrect
      a dead channel . */
-  caml_plat_lock_blocking(&caml_all_opened_channels_mutex);
+  caml_plat_lock_non_blocking(&caml_all_opened_channels_mutex);
   chan->refcount --;
   if (chan->refcount > 0 || notflushed) {
     /* We need to keep the channel around, either because it is being
@@ -612,7 +612,7 @@ CAMLprim value caml_ml_open_descriptor_in_with_flags(int fd, int flags)
   struct channel * chan = caml_open_descriptor_in(fd);
   chan->flags |= flags | CHANNEL_FLAG_MANAGED_BY_GC;
   chan->refcount = 1;
-  caml_plat_lock_blocking(&caml_all_opened_channels_mutex);
+  caml_plat_lock_non_blocking(&caml_all_opened_channels_mutex);
   link_channel (chan);
   caml_plat_unlock (&caml_all_opened_channels_mutex);
   return caml_alloc_channel(chan);
@@ -627,7 +627,7 @@ CAMLprim value caml_ml_open_descriptor_out_with_flags(int fd, int flags)
   struct channel * chan = caml_open_descriptor_out(fd);
   chan->flags |= flags | CHANNEL_FLAG_MANAGED_BY_GC;
   chan->refcount = 1;
-  caml_plat_lock_blocking(&caml_all_opened_channels_mutex);
+  caml_plat_lock_non_blocking(&caml_all_opened_channels_mutex);
   link_channel (chan);
   caml_plat_unlock (&caml_all_opened_channels_mutex);
   return caml_alloc_channel(chan);
@@ -663,7 +663,7 @@ CAMLprim value caml_ml_out_channels_list (value unit)
   struct channel_list *channel_list = NULL, *cl_tmp;
   mlsize_t num_channels = 0;

-  caml_plat_lock_blocking(&caml_all_opened_channels_mutex);
+  caml_plat_lock_non_blocking(&caml_all_opened_channels_mutex);
   for (struct channel *channel = caml_all_opened_channels;
        channel != NULL;
        channel = channel->next) {

Diff for: runtime/runtime_events.c

+3 -2

@@ -376,6 +376,7 @@ static void runtime_events_create_from_stw_single(void) {

   // at the same instant: snapshot user_events list and set
   // runtime_events_enabled to 1
+  /* calling from STW */
   caml_plat_lock_blocking(&user_events_lock);
   value current_user_event = user_events;
   atomic_store_release(&runtime_events_enabled, 1);
@@ -716,7 +717,7 @@ CAMLprim value caml_runtime_events_user_register(value event_name,
   Field(event, 3) = event_tag;


-  caml_plat_lock_blocking(&user_events_lock);
+  caml_plat_lock_non_blocking(&user_events_lock);
   // critical section: when we update the user_events list we need to make sure
   // it is not updated while we construct the pointer to the next element

@@ -832,7 +833,7 @@ CAMLexport value caml_runtime_events_user_resolve(
   CAMLlocal3(event, cur_event_name, ml_event_name);

   // TODO: it might be possible to atomic load instead
-  caml_plat_lock_blocking(&user_events_lock);
+  caml_plat_lock_non_blocking(&user_events_lock);
   value current_user_event = user_events;
   caml_plat_unlock(&user_events_lock);

Diff for: runtime/signals.c

+8 -11

@@ -687,6 +687,8 @@ CAMLprim value caml_install_signal_handler(value signal_number, value action)
     act = 2;
     break;
   }
+  caml_plat_lock_non_blocking(&signal_install_mutex);
+  /* Note: no safepoint for calling signals in this critical section */
   oldact = caml_set_signal_action(sig, act);
   switch (oldact) {
   case 0: /* was Signal_default */
@@ -700,24 +702,19 @@ CAMLprim value caml_install_signal_handler(value signal_number, value action)
     Field(res, 0) = Field(caml_signal_handlers, sig);
     break;
   default: /* error in caml_set_signal_action */
-    caml_sys_error(NO_ARG);
+    goto err;
   }
   if (Is_block(action)) {
-    /* Speculatively allocate this so we don't hold the lock for
-       a GC */
     if (caml_signal_handlers == 0) {
-      tmp_signal_handlers = caml_alloc(NSIG, 0);
-    }
-    caml_plat_lock_blocking(&signal_install_mutex);
-    if (caml_signal_handlers == 0) {
-      /* caml_alloc cannot raise asynchronous exceptions from signals
-         so this is safe */
-      caml_signal_handlers = tmp_signal_handlers;
+      caml_signal_handlers = caml_alloc(NSIG, 0);
       caml_register_global_root(&caml_signal_handlers);
     }
     caml_modify(&Field(caml_signal_handlers, sig), Field(action, 0));
-    caml_plat_unlock(&signal_install_mutex);
   }
+  caml_plat_unlock(&signal_install_mutex);
   caml_get_value_or_raise(caml_process_pending_signals_res());
   CAMLreturn (res);
+ err:
+  caml_plat_unlock(&signal_install_mutex);
+  caml_sys_error(NO_ARG);
 }
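
The restructuring above adds an err: label so that signal_install_mutex is always released before caml_sys_error raises. A minimal sketch of the same unlock-before-raise pattern, with hypothetical mutex, primitive, and error message:

#include <caml/fail.h>
#include <caml/mlvalues.h>
#include "caml/platform.h"   /* runtime-internal header */

static caml_plat_mutex table_lock = CAML_PLAT_MUTEX_INITIALIZER;
static int table_ready = 0;

/* Raising (caml_failwith, caml_sys_error, ...) unwinds past this C
   frame, so the mutex must be released first; the error path jumps to
   a label that unlocks before raising. */
CAMLprim value sketch_use_table(value unit)
{
  caml_plat_lock_non_blocking(&table_lock);
  if (!table_ready) goto err;
  /* ... short critical section ... */
  caml_plat_unlock(&table_lock);
  return Val_unit;
 err:
  caml_plat_unlock(&table_lock);
  caml_failwith("table not initialized");
}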
