@@ -63,25 +63,25 @@ std::tuple<uint64_t, dim3, dim3> calc_execution_policy(const int64_t total_eleme
 // grid stride loop kernel for distributions
 template <typename accscalar_t, int unroll_factor, typename dist_t, typename transform_t>
 C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound)
-__global__ void distribution_elementwise_grid_stride_kernel(int numel,
+__global__ void distribution_elementwise_grid_stride_kernel(int64_t numel,
                                                             PhiloxCudaState philox_args,
                                                             const dist_t dist_func,
                                                             const transform_t transform_func) {
   auto seeds = at::cuda::philox::unpack(philox_args);
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   curandStatePhilox4_32_10_t state;
   curand_init(std::get<0>(seeds),
               idx,
               std::get<1>(seeds),
               &state);
 
-  int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) *
+  int64_t rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) *
         blockDim.x * gridDim.x * unroll_factor;
-  for (int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) {
+  for (int64_t linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) {
     auto rand = dist_func(&state);
     #pragma unroll
     for (int ii = 0; ii < unroll_factor; ii++) {
-      int li = linear_index + blockDim.x * gridDim.x * ii;
+      int64_t li = linear_index + blockDim.x * gridDim.x * ii;
       if (li < numel) {
         transform_func(li, static_cast<accscalar_t>((&rand.x)[ii]));
       }
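For context on why the diff widens these indices: with 32-bit `int`, the thread index and the grid-stride loop counter wrap once a tensor holds more than INT_MAX (about 2^31 - 1) elements, producing negative or out-of-range indices. Below is a minimal standalone sketch of the same grid-stride pattern with 64-bit indexing; it is an illustration under assumed names (`fill_kernel`, `out`, `value` are hypothetical, not part of this commit). Note that `blockIdx.x`, `blockDim.x`, and `gridDim.x` are unsigned 32-bit values, so the sketch casts one operand to `int64_t` before multiplying to keep the products themselves in 64 bits.

#include <cstdint>

// Hypothetical kernel, for illustration only: each thread walks the
// flattened buffer with a stride of (total number of launched threads),
// using int64_t so indexing stays correct past 2^31 - 1 elements.
__global__ void fill_kernel(int64_t numel, float value, float* out) {
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (int64_t i = idx; i < numel; i += stride) {
    out[i] = value;
  }
}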