
Commit 179854e

CA-410782: Add receive_memory_queues for VM_receive_memory operations
Migration spawns two operations, send and receive, which depend on each other, so we need to ensure there is always space for both of them to prevent a deadlock. Adding VM_receive_memory to a new queue ensures that there will always be a worker for the receive operation, so the paired send will never be blocked.

Signed-off-by: Steven Woods <[email protected]>
1 parent 17181ca commit 179854e
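
To make the failure mode concrete, here is a minimal sketch of the problem, assuming a toy bounded worker pool; the pool type, the sizes and the migrate function are illustrative assumptions, not xenopsd code:

(* Toy model: a bounded pool of workers; starting work succeeds only if a
   worker is free. *)
type pool = {capacity: int; mutable busy: int}

let try_start p =
  if p.busy < p.capacity then (
    p.busy <- p.busy + 1 ;
    true
  ) else
    false

let finish p = p.busy <- p.busy - 1

(* A migration is a send that cannot complete until its paired receive has
   run. If the receive must be scheduled on the same pool the send is
   occupying, a full pool can never drain: that is the deadlock. *)
let migrate ~send_pool ~recv_pool =
  if not (try_start send_pool) then
    Error "no worker free for the send"
  else if not (try_start recv_pool) then
    Error "deadlock: send holds the last worker, receive cannot start"
  else (
    finish recv_pool ;
    finish send_pool ;
    Ok ()
  )

let () =
  (* One shared pool of size 1: the single worker is consumed by the send,
     so the receive it depends on can never be scheduled. *)
  let shared = {capacity= 1; busy= 0} in
  (match migrate ~send_pool:shared ~recv_pool:shared with
  | Error e -> print_endline e
  | Ok () -> print_endline "ok"
  ) ;
  (* Dedicated receive pool: the receive always finds a worker, so the send
     it unblocks can complete. *)
  let sends = {capacity= 1; busy= 0} in
  let receives = {capacity= 1; busy= 0} in
  match migrate ~send_pool:sends ~recv_pool:receives with
  | Error e -> print_endline e
  | Ok () -> print_endline "ok: dedicated receive queue avoids the deadlock"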

1 file changed: +22 −7 lines

ocaml/xenopsd/lib/xenops_server.ml

@@ -928,6 +928,12 @@ module Redirector = struct
   let nested_parallel_queues =
     {queues= Queues.create (); mutex= Mutex.create ()}
 
+  (* We create another queue only for VM_receive_memory operations for the same reason again.
+     Migration spawns 2 operations, send and receive, so if there is limited available worker space
+     a deadlock can happen when VMs are migrating between hosts or on localhost migration
+     as the receiver has no free workers to receive memory. *)
+  let receive_memory_queues = {queues= Queues.create (); mutex= Mutex.create ()}
+
   (* we do not want to use = when comparing queues: queues can contain
      (uncomparable) functions, and we are only interested in comparing the
      equality of their static references *)
@@ -1062,6 +1068,7 @@ module Redirector = struct
         (default.queues
         :: parallel_queues.queues
         :: nested_parallel_queues.queues
+        :: receive_memory_queues.queues
         :: List.map snd (StringMap.bindings !overrides)
         )
       )
@@ -1297,7 +1304,8 @@ module WorkerPool = struct
     for _i = 1 to size do
       incr Redirector.default ;
       incr Redirector.parallel_queues ;
-      incr Redirector.nested_parallel_queues
+      incr Redirector.nested_parallel_queues ;
+      incr Redirector.receive_memory_queues
     done
 
   let set_size size =
@@ -1313,7 +1321,8 @@ module WorkerPool = struct
     in
     inner Redirector.default ;
     inner Redirector.parallel_queues ;
-    inner Redirector.nested_parallel_queues
+    inner Redirector.nested_parallel_queues ;
+    inner Redirector.receive_memory_queues
 end
 
 (* Keep track of which VMs we're rebooting so we avoid transient glitches where
@@ -3360,19 +3369,20 @@ let uses_mxgpu id =
     )
     (VGPU_DB.ids id)
 
-let queue_operation_int ?traceparent dbg id op =
+let queue_operation_int ?traceparent ?(redirector = Redirector.default) dbg id
+    op =
   let task =
     Xenops_task.add ?traceparent tasks dbg
       (let r = ref None in
        fun t -> perform ~result:r op t ; !r
       )
   in
   let tag = if uses_mxgpu id then "mxgpu" else id in
-  Redirector.push Redirector.default tag (op, task) ;
+  Redirector.push redirector tag (op, task) ;
   task
 
-let queue_operation ?traceparent dbg id op =
-  let task = queue_operation_int ?traceparent dbg id op in
+let queue_operation ?traceparent ?redirector dbg id op =
+  let task = queue_operation_int ?traceparent ?redirector dbg id op in
   Xenops_task.id_of_handle task
 
 let queue_operation_and_wait dbg id op =
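
Because redirector is optional and defaults to Redirector.default, every existing call site keeps its behaviour; only the receive path opts in. A hedged sketch of the two call styles (dbg, id and op stand for values already in scope at a real call site):

(* Unchanged callers: the operation still lands on Redirector.default. *)
let _existing_task_id = queue_operation dbg id op

(* The migration receive path opts in to the dedicated queue. *)
let _receive_task_id =
  queue_operation ~redirector:Redirector.receive_memory_queues dbg id op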
@@ -3821,7 +3831,12 @@ module VM = struct
           ; vmr_compressed= compressed_memory
           }
         in
-        let task = Some (queue_operation ?traceparent dbg id op) in
+        let task =
+          Some
+            (queue_operation ?traceparent
+               ~redirector:Redirector.receive_memory_queues dbg id op
+            )
+        in
         Option.iter
           (fun t -> t |> Xenops_client.wait_for_task dbg |> ignore)
           task
