@@ -477,8 +477,8 @@ fn remove_cycle(
477
477
/// Detects query cycles by using depth first search over all active query jobs.
478
478
/// If a query cycle is found it will break the cycle by finding an edge which
479
479
/// uses a query latch and then resuming that waiter.
480
- /// There may be multiple cycles involved in a deadlock, so this searches
481
- /// all active queries for cycles before finally resuming all the waiters at once.
480
+ /// There may be multiple cycles involved in a deadlock, but we only search
481
+ /// for one cycle per call and resume a single waiter. See the `FIXME` below.
482
482
pub fn break_query_cycles ( query_map : QueryMap , registry : & rayon_core:: Registry ) {
483
483
let mut wakelist = Vec :: new ( ) ;
484
484
let mut jobs: Vec < QueryJobId > = query_map. keys ( ) . cloned ( ) . collect ( ) ;
@@ -488,6 +488,18 @@ pub fn break_query_cycles(query_map: QueryMap, registry: &rayon_core::Registry)
488
488
while jobs. len ( ) > 0 {
489
489
if remove_cycle ( & query_map, & mut jobs, & mut wakelist) {
490
490
found_cycle = true ;
491
+
492
+ // FIXME(#137731): We can encounter deadlocks for cycles we can't break here,
493
+ // but it's still unclear whether it's due to possible issues in rustc-rayon
494
+ // or instead in the handling of query cycles. We can avoid them by only waking
495
+ // up a single waiter instead of all of them. The deadlock issues seem to only
496
+ // appear when multiple query cycle errors are involved, so this reduction in
497
+ // parallelism, while suboptimal, is not universal and only the deadlock handler
498
+ // will encounter these cases. The workaround forfeits some potential gains,
499
+ // but there still are big improvements in the common case, and no regressions
500
+ // compared to the single-threaded case. More investigation is still needed,
501
+ // and once fixed, we can wake all the waiters up at once.
502
+ break ;
491
503
}
492
504
}
493
505
@@ -506,7 +518,7 @@ pub fn break_query_cycles(query_map: QueryMap, registry: &rayon_core::Registry)
506
518
) ;
507
519
}
508
520
509
- // FIXME: Ensure this won't cause a deadlock before we return
521
+ // FIXME: Ensure this won't cause a deadlock if we resume all waiters at once.
510
522
for waiter in wakelist. into_iter ( ) {
511
523
waiter. notify ( registry) ;
512
524
}
0 commit comments