
Commit 3e44880

wesolwsk authored and facebook-github-bot committed
Modify TileOp GPU implementation to expose more concurrency and better utilize GPU memory bandwidth (pytorch#17275)
Summary: Pull Request resolved: pytorch#17275

The previous implementation used a memcpy inside the kernel. It is more efficient to reduce the data fetched per thread to a single word of memory: this exposes more concurrency and takes advantage of GPU memory coalescing support. (A worked example follows below, and a standalone sketch of the new indexing appears after the diff.)

Reviewed By: takatosp1

Differential Revision: D14120147

fbshipit-source-id: c4734003d4342e55147c5b858f232a006af60b68
1 parent 9e4a993 commit 3e44880
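To make the bandwidth argument concrete, consider a hypothetical float input with outer_dim = 64, inner_dim = 1024, and tiles = 4 (illustrative numbers, not from the PR). The old kernel looped over only outer_dim * tiles = 256 indices, each performing a serial memcpy of inner_dim * 4 = 4096 bytes, so at most 256 threads did useful work at a time. The new kernel loops over outer_dim * inner_dim * tiles = 262,144 indices, moving one 4-byte word each, so consecutive threads read and write consecutive addresses and the memory accesses coalesce.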

File tree

1 file changed: +17, -15 lines


caffe2/operators/tile_op.cu

Lines changed: 17 additions & 15 deletions
@@ -5,19 +5,17 @@
 
 namespace caffe2 {
 namespace {
+template <typename T>
 __global__ void TileCopyKernel(
-    int item_size,
     int outer_dim,
     int inner_dim,
     int tiles,
-    const char* input_data,
-    char* output_data) {
-  CUDA_1D_KERNEL_LOOP(index, outer_dim * tiles) {
-    int i = index / tiles;
-    int t = index % tiles;
-    const char* input_ptr = input_data + inner_dim * item_size * i;
-    char* output_ptr = output_data + (i * tiles + t) * inner_dim * item_size;
-    memcpy(output_ptr, input_ptr, inner_dim * item_size);
+    const T* input_data,
+    T* output_data) {
+  CUDA_1D_KERNEL_LOOP(index, outer_dim * inner_dim * tiles) {
+    int col = index % inner_dim;
+    int row = index / (inner_dim * tiles);
+    output_data[index] = input_data[row * inner_dim + col];
   }
 }
 
@@ -58,12 +56,16 @@ void TileOp<CUDAContext>::DoTile(
     int inner_dim,
     const char* input_data,
     char* output_data) {
-  TileCopyKernel<<<
-      std::min(outer_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      item_size, outer_dim, inner_dim, tiles_, input_data, output_data);
+  TileCopyKernel<float>
+      <<<std::min(outer_dim * inner_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          outer_dim,
+          inner_dim,
+          tiles_,
+          reinterpret_cast<const float*>(input_data),
+          reinterpret_cast<float*>(output_data));
 }
 
 template <>
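For readers outside the Caffe2 tree, here is a minimal standalone sketch of the new one-word-per-thread indexing. Everything in it is illustrative: the name TileCopySketch, the hand-expanded grid-stride loop (the shape Caffe2's CUDA_1D_KERNEL_LOOP macro expands to), and the tiny 2x3 test shape are assumptions for the demo, not code from this commit.

#include <cstdio>
#include <cuda_runtime.h>

// One thread per output word: a grid-stride loop of the shape that
// Caffe2's CUDA_1D_KERNEL_LOOP macro expands to.
template <typename T>
__global__ void TileCopySketch(
    int outer_dim, int inner_dim, int tiles, const T* in, T* out) {
  const int total = outer_dim * inner_dim * tiles;
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < total;
       index += blockDim.x * gridDim.x) {
    const int col = index % inner_dim;            // offset inside one copy of the row
    const int row = index / (inner_dim * tiles);  // which input row this element tiles
    out[index] = in[row * inner_dim + col];       // adjacent threads touch adjacent words
  }
}

int main() {
  // Tile each row of a 2x3 float input twice: the output is 2x6.
  const int outer_dim = 2, inner_dim = 3, tiles = 2;
  const int n_in = outer_dim * inner_dim;
  const int n_out = n_in * tiles;
  float h_in[n_in] = {1, 2, 3, 4, 5, 6};
  float h_out[n_out];

  float *d_in, *d_out;
  cudaMalloc(&d_in, n_in * sizeof(float));
  cudaMalloc(&d_out, n_out * sizeof(float));
  cudaMemcpy(d_in, h_in, n_in * sizeof(float), cudaMemcpyHostToDevice);

  TileCopySketch<float><<<1, 128>>>(outer_dim, inner_dim, tiles, d_in, d_out);
  cudaMemcpy(h_out, d_out, n_out * sizeof(float), cudaMemcpyDeviceToHost);

  for (int i = 0; i < n_out; ++i) printf("%g ", h_out[i]);  // 1 2 3 1 2 3 4 5 6 4 5 6
  printf("\n");

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

The expected output is 1 2 3 1 2 3 4 5 6 4 5 6: each input row repeated tiles times back to back, i.e. output element (i * tiles + t) * inner_dim + col holds input element (i, col), exactly the layout the old kernel produced with its per-thread memcpy.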
