4
4
//! thread blocks and execute in SIMT fashion.
5
5
6
6
use crate :: gpu_only;
7
+ #[ cfg( target_os = "cuda" ) ]
7
8
use core:: arch:: asm;
8
9
use half:: { bf16, f16} ;
9
10
@@ -329,7 +330,7 @@ unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) {
329
330
/// Behavior is undefined if:
330
331
/// - Any thread participating in the vote has exited or the executing thread is not in `mask`.
331
332
/// - For `compute_62` and below, all threads in `mask` must call this function in convergence, and only threads belonging
332
- /// to the `mask` can be active when the intrinsic is called.
333
+ /// to the `mask` can be active when the intrinsic is called.
333
334
/// - A thread tries to execute this function while not being present in `mask`.
334
335
#[ gpu_only]
335
336
pub unsafe fn warp_vote_all ( mask : u32 , predicate : bool ) -> bool {
@@ -359,7 +360,7 @@ pub unsafe fn warp_vote_all(mask: u32, predicate: bool) -> bool {
359
360
/// Behavior is undefined if:
360
361
/// - Any thread participating in the vote has exited or the executing thread is not in `mask`.
361
362
/// - For `compute_62` and below, all threads in `mask` must call this function in convergence, and only threads belonging
362
- /// to the `mask` can be active when the intrinsic is called.
363
+ /// to the `mask` can be active when the intrinsic is called.
363
364
/// - A thread tries to execute this function while not being present in `mask`.
364
365
#[ gpu_only]
365
366
pub unsafe fn warp_vote_any ( mask : u32 , predicate : bool ) -> bool {
@@ -389,7 +390,7 @@ pub unsafe fn warp_vote_any(mask: u32, predicate: bool) -> bool {
389
390
/// Behavior is undefined if:
390
391
/// - Any thread participating in the vote has exited or the executing thread is not in `mask`.
391
392
/// - For `compute_62` and below, all threads in `mask` must call this function in convergence, and only threads belonging
392
- /// to the `mask` can be active when the intrinsic is called.
393
+ /// to the `mask` can be active when the intrinsic is called.
393
394
/// - A thread tries to execute this function while not being present in `mask`.
394
395
#[ gpu_only]
395
396
pub unsafe fn warp_vote_ballot ( mask : u32 , predicate : bool ) -> u32 {
@@ -415,10 +416,10 @@ pub unsafe fn warp_vote_ballot(mask: u32, predicate: bool) -> u32 {
415
416
///
416
417
/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
417
418
/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
418
- /// that calculates this thread as its target lane.
419
+ /// that calculates this thread as its target lane.
419
420
/// - `delta` is the value that will be added to the current thread's lane to calculate the target lane.
420
421
/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
421
- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
422
+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
422
423
///
423
424
/// # Returns
424
425
///
@@ -439,7 +440,7 @@ pub unsafe fn warp_vote_ballot(mask: u32, predicate: bool) -> u32 {
439
440
/// Behavior is undefined if:
440
441
/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
441
442
/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
442
- /// in `mask` can be active when the shuffle is called.
443
+ /// in `mask` can be active when the shuffle is called.
443
444
///
444
445
/// The returned value is unspecified if the calculated target lane is inactive.
445
446
pub unsafe fn warp_shuffle_down < T : WarpShuffleValue > (
@@ -457,10 +458,10 @@ pub unsafe fn warp_shuffle_down<T: WarpShuffleValue>(
457
458
///
458
459
/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
459
460
/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
460
- /// that calculates this thread as its target lane.
461
+ /// that calculates this thread as its target lane.
461
462
/// - `delta` is the value that will be subtracted from the current thread's lane to calculate the target lane.
462
463
/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
463
- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
464
+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
464
465
///
465
466
/// # Returns
466
467
///
@@ -481,7 +482,7 @@ pub unsafe fn warp_shuffle_down<T: WarpShuffleValue>(
481
482
/// Behavior is undefined if:
482
483
/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
483
484
/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
484
- /// in `mask` can be active when the shuffle is called.
485
+ /// in `mask` can be active when the shuffle is called.
485
486
///
486
487
/// The returned value is unspecified if the calculated target lane is inactive.
487
488
pub unsafe fn warp_shuffle_up < T : WarpShuffleValue > (
@@ -499,10 +500,10 @@ pub unsafe fn warp_shuffle_up<T: WarpShuffleValue>(
499
500
///
500
501
/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
501
502
/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
502
- /// that calculates this thread as its target lane.
503
+ /// that calculates this thread as its target lane.
503
504
/// - `idx` is the target lane that will be used as the source of this thread's returned value.
504
505
/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
505
- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
506
+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
506
507
///
507
508
/// # Returns
508
509
///
@@ -523,7 +524,7 @@ pub unsafe fn warp_shuffle_up<T: WarpShuffleValue>(
523
524
/// Behavior is undefined if:
524
525
/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
525
526
/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
526
- /// in `mask` can be active when the shuffle is called.
527
+ /// in `mask` can be active when the shuffle is called.
527
528
///
528
529
/// The returned value is unspecified if the calculated target lane is inactive.
529
530
pub unsafe fn warp_shuffle_idx < T : WarpShuffleValue > (
@@ -541,11 +542,11 @@ pub unsafe fn warp_shuffle_idx<T: WarpShuffleValue>(
541
542
///
542
543
/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
543
544
/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
544
- /// that calculates this thread as its target lane.
545
+ /// that calculates this thread as its target lane.
545
546
/// - `lane_mask` is the value that will be XOR'd with the current thread's lane id to calculate the target lane, i.e. the
546
- /// target lane will be `lane_id ^ lane_mask`.
547
+ /// target lane will be `lane_id ^ lane_mask`.
547
548
/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
548
- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
549
+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
549
550
///
550
551
/// # Returns
551
552
///
@@ -566,7 +567,7 @@ pub unsafe fn warp_shuffle_idx<T: WarpShuffleValue>(
566
567
/// Behavior is undefined if:
567
568
/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
568
569
/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
569
- /// in `mask` can be active when the shuffle is called.
570
+ /// in `mask` can be active when the shuffle is called.
570
571
///
571
572
/// The returned value is unspecified if the calculated target lane is inactive.
572
573
pub unsafe fn warp_shuffle_xor < T : WarpShuffleValue > (
0 commit comments