// Pick the QP index and sharing mode for the current warp via comm::get_qp_mode.
// Template arguments: kNumSMs, kNumQPs, kNumChannelsPerSM, and a flag that is true
// when kNumNotifyWarps > 0.
// Runtime arguments:
//   - sm_idx
//   - (warp_idx - kNumNotifyWarps) % kNumChannelsPerSM, presumably the per-SM channel
//     index of a non-notify warp (TODO confirm against get_qp_mode)
//   - warp_idx < kNumNotifyWarps, true for the first kNumNotifyWarps warps
// NOTE(review): when warp_idx < kNumNotifyWarps the second argument is computed from
// a negative difference (or wraps if the operands are unsigned); presumably
// get_qp_mode ignores it in that case -- confirm.
const auto [qp_idx, sharing_mode] = comm::get_qp_mode<kNumSMs, kNumQPs, kNumChannelsPerSM, (kNumNotifyWarps > 0)>(
sm_idx, (warp_idx - kNumNotifyWarps) % kNumChannelsPerSM, warp_idx < kNumNotifyWarps);
// Build the NCCLGin handle from the NCCL device communicator and window, using the
// QP index and sharing mode selected above.
const auto gin = handle::NCCLGin(nccl_dev_comm, nccl_window, qp_idx, sharing_mode);
Hybrid mode sets num_qps to num_sms * 16 + 1, but it seems that only
`kNumChannelsPerSM` are used, and `kNumMaxChannelsPerSM` is 8. So is num_sms * 8 enough?