Skip to content

Commit d9466ed

Browse files
kjetilkjeka and Amanieu
authored and committed
NVPTX: Add f16 SIMD intrinsics
1 parent 2adb43d commit d9466ed

File tree

2 files changed

+148
-0
lines changed

2 files changed

+148
-0
lines changed

crates/core_arch/src/nvptx/mod.rs

+5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
1414
use crate::ffi::c_void;
1515

16+
mod packed;
17+
18+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
19+
pub use packed::*;
20+
1621
#[allow(improper_ctypes)]
1722
extern "C" {
1823
#[link_name = "llvm.nvvm.barrier0"]

crates/core_arch/src/nvptx/packed.rs

+143
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
//! NVPTX Packed data types (SIMD)
2+
//!
3+
//! Packed Data Types is what PTX calls SIMD types. See [PTX ISA (Packed Data Types)](https://docs.nvidia.com/cuda/parallel-thread-execution/#packed-data-types) for a full reference.
4+
5+
// Note: #[assert_instr] tests are not actually being run on nvptx due to being a `no_std` target incapable of running tests. Something like FileCheck would be appropriate for verifying the correct instruction is used.
6+
7+
use crate::intrinsics::simd::*;
8+
9+
#[allow(improper_ctypes)]
10+
extern "C" {
11+
#[link_name = "llvm.minnum.v2f16"]
12+
fn llvm_f16x2_minnum(a: f16x2, b: f16x2) -> f16x2;
13+
#[link_name = "llvm.minimum.v2f16"]
14+
fn llvm_f16x2_minimum(a: f16x2, b: f16x2) -> f16x2;
15+
#[link_name = "llvm.maxnum.v2f16"]
16+
fn llvm_f16x2_maxnum(a: f16x2, b: f16x2) -> f16x2;
17+
#[link_name = "llvm.maximum.v2f16"]
18+
fn llvm_f16x2_maximum(a: f16x2, b: f16x2) -> f16x2;
19+
}
20+
21+
types! {
    #![unstable(feature = "stdarch_nvptx", issue = "111199")]

    /// PTX-specific 32-bit wide floating point (f16 x 2) vector type
    // Matches PTX's packed `f16x2` data type: two IEEE half-precision lanes
    // packed into a single 32-bit register.
    pub struct f16x2(2 x f16);
}
28+
29+
/// Add two values, round to nearest even
30+
///
31+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add>
32+
///
33+
/// Corresponds to the CUDA C intrinsics:
34+
/// - [`__hadd2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g921c795176eaa31265bd80ef4fe4b8e6)
35+
/// - [`__hadd2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g6cd8ddb2c3d670e1a10c3eb2e7644f82)
36+
#[inline]
37+
#[cfg_attr(test, assert_instr(add.rn.f16x22))]
38+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
39+
pub unsafe fn f16x2_add(a: f16x2, b: f16x2) -> f16x2 {
40+
simd_add(a, b)
41+
}
42+
43+
/// Subtract two values, round to nearest even
44+
///
45+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub>
46+
///
47+
/// Corresponds to the CUDA C intrinsics:
48+
/// - [`__hsub2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1ga5536c9c3d853d8c8b9de60e18b41e54)
49+
/// - [`__hsub2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g8adc164c68d553354f749f0f0645a874)
50+
#[inline]
51+
#[cfg_attr(test, assert_instr(sub.rn.f16x2))]
52+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
53+
pub unsafe fn f16x2_sub(a: f16x2, b: f16x2) -> f16x2 {
54+
simd_sub(a, b)
55+
}
56+
57+
/// Multiply two values, round to nearest even
58+
///
59+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-mul>
60+
///
61+
/// Corresponds to the CUDA C intrinsics:
62+
/// - [`__hmul2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g70de3f2ee48babe4e0969397ac17708e)
63+
/// - [`__hmul2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g99f8fe23a4b4c6898d6faf999afaa76e)
64+
#[inline]
65+
#[cfg_attr(test, assert_instr(mul.rn.f16x2))]
66+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
67+
pub unsafe fn f16x2_mul(a: f16x2, b: f16x2) -> f16x2 {
68+
simd_mul(a, b)
69+
}
70+
71+
/// Fused multiply-add, round to nearest even
72+
///
73+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-fma>
74+
///
75+
/// Corresponds to the CUDA C intrinsics:
76+
/// - [`__fma2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab)
77+
/// - [`__fma2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab)
78+
#[inline]
79+
#[cfg_attr(test, assert_instr(fma.rn.f16x2))]
80+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
81+
pub unsafe fn f16x2_fma(a: f16x2, b: f16x2, c: f16x2) -> f16x2 {
82+
simd_fma(a, b, c)
83+
}
84+
85+
/// Arithmetic negate
86+
///
87+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-neg>
88+
///
89+
/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6)
90+
#[inline]
91+
#[cfg_attr(test, assert_instr(neg.f16x2))]
92+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
93+
pub unsafe fn f16x2_neg(a: f16x2) -> f16x2 {
94+
simd_neg(a)
95+
}
96+
97+
/// Find the minimum of two values
98+
///
99+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-min>
100+
///
101+
/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6)
102+
#[inline]
103+
#[cfg_attr(test, assert_instr(min.f16x2))]
104+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
105+
pub unsafe fn f16x2_min(a: f16x2, b: f16x2) -> f16x2 {
106+
llvm_f16x2_minnum(a, b)
107+
}
108+
109+
/// Find the minimum of two values, NaNs pass through.
110+
///
111+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-min>
112+
///
113+
/// Corresponds to the CUDA C intrinsic [`__hmin2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g8bb8f58e9294cc261d2f42c4d5aecd6b)
114+
#[inline]
115+
#[cfg_attr(test, assert_instr(min.NaN.f16x2))]
116+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
117+
pub unsafe fn f16x2_min_nan(a: f16x2, b: f16x2) -> f16x2 {
118+
llvm_f16x2_minimum(a, b)
119+
}
120+
121+
/// Find the maximum of two values
122+
///
123+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-max>
124+
///
125+
/// Corresponds to the CUDA C intrinsic [`__hmax2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g59fc7fc7975d8127b202444a05e57e3d)
126+
#[inline]
127+
#[cfg_attr(test, assert_instr(max.f16x2))]
128+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
129+
pub unsafe fn f16x2_max(a: f16x2, b: f16x2) -> f16x2 {
130+
llvm_f16x2_maxnum(a, b)
131+
}
132+
133+
/// Find the maximum of two values, NaNs pass through.
134+
///
135+
/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-max>
136+
///
137+
/// Corresponds to the CUDA C intrinsic [`__hmax2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g41623db7850e3074fd9daa80a14c3897)
138+
#[inline]
139+
#[cfg_attr(test, assert_instr(max.NaN.f16x2))]
140+
#[unstable(feature = "stdarch_nvptx", issue = "111199")]
141+
pub unsafe fn f16x2_max_nan(a: f16x2, b: f16x2) -> f16x2 {
142+
llvm_f16x2_maximum(a, b)
143+
}

0 commit comments

Comments
 (0)