Skip to content

Commit 8b019f5

Browse files
committed
CA-410782: Add receive_memory_queues for VM_receive_memory operations
Migration spawns 2 operations which depend on each other so we need to ensure there is always space for both of them to prevent a deadlock. Adding VM_receive_memory to a new queue ensures that there will always be a worker for the receive operation so the paired send will never be blocked. Signed-off-by: Steven Woods <[email protected]>
1 parent 418b610 commit 8b019f5

File tree

1 file changed

+22
-7
lines changed

1 file changed

+22
-7
lines changed

ocaml/xenopsd/lib/xenops_server.ml

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,12 @@ module Redirector = struct
916916
Parallel atoms, creating a deadlock. *)
917917
let parallel_queues = {queues= Queues.create (); mutex= Mutex.create ()}
918918

919+
(* We create another queue only for VM_receive_memory operations for the same reason as Parallel atoms.
920+
Migration spawns 2 operations, send and receive, so if there is limited available worker space
921+
a deadlock can happen when VMs are migrating between hosts or on localhost migration
922+
as the receiver has no free workers to receive memory. *)
923+
let receive_memory_queues = {queues= Queues.create (); mutex= Mutex.create ()}
924+
919925
(* When a thread is actively processing a queue, items are redirected to a
920926
thread-private queue *)
921927
let overrides = ref StringMap.empty
@@ -1035,6 +1041,7 @@ module Redirector = struct
10351041
List.concat_map one
10361042
(default.queues
10371043
:: parallel_queues.queues
1044+
:: receive_memory_queues.queues
10381045
:: List.map snd (StringMap.bindings !overrides)
10391046
)
10401047
)
@@ -1268,7 +1275,8 @@ module WorkerPool = struct
12681275
let start size =
12691276
for _i = 1 to size do
12701277
incr Redirector.default ;
1271-
incr Redirector.parallel_queues
1278+
incr Redirector.parallel_queues ;
1279+
incr Redirector.receive_memory_queues
12721280
done
12731281

12741282
let set_size size =
@@ -1283,7 +1291,8 @@ module WorkerPool = struct
12831291
done
12841292
in
12851293
inner Redirector.default ;
1286-
inner Redirector.parallel_queues
1294+
inner Redirector.parallel_queues ;
1295+
inner Redirector.receive_memory_queues
12871296
end
12881297

12891298
(* Keep track of which VMs we're rebooting so we avoid transient glitches where
@@ -3276,19 +3285,20 @@ let uses_mxgpu id =
32763285
)
32773286
(VGPU_DB.ids id)
32783287

3279-
let queue_operation_int ?traceparent dbg id op =
3288+
let queue_operation_int ?traceparent ?(redirector = Redirector.default) dbg id
3289+
op =
32803290
let task =
32813291
Xenops_task.add ?traceparent tasks dbg
32823292
(let r = ref None in
32833293
fun t -> perform ~result:r op t ; !r
32843294
)
32853295
in
32863296
let tag = if uses_mxgpu id then "mxgpu" else id in
3287-
Redirector.push Redirector.default tag (op, task) ;
3297+
Redirector.push redirector tag (op, task) ;
32883298
task
32893299

3290-
let queue_operation ?traceparent dbg id op =
3291-
let task = queue_operation_int ?traceparent dbg id op in
3300+
let queue_operation ?traceparent ?redirector dbg id op =
3301+
let task = queue_operation_int ?traceparent ?redirector dbg id op in
32923302
Xenops_task.id_of_handle task
32933303

32943304
let queue_operation_and_wait dbg id op =
@@ -3737,7 +3747,12 @@ module VM = struct
37373747
; vmr_compressed= compressed_memory
37383748
}
37393749
in
3740-
let task = Some (queue_operation ?traceparent dbg id op) in
3750+
let task =
3751+
Some
3752+
(queue_operation ?traceparent
3753+
~redirector:Redirector.receive_memory_queues dbg id op
3754+
)
3755+
in
37413756
Option.iter
37423757
(fun t -> t |> Xenops_client.wait_for_task dbg |> ignore)
37433758
task

0 commit comments

Comments
 (0)