|
5 | 5 |
|
6 | 6 | namespace caffe2 {
|
7 | 7 | namespace {
|
| 8 | +template <typename T> |
8 | 9 | __global__ void TileCopyKernel(
|
9 |
| - int item_size, |
10 | 10 | int outer_dim,
|
11 | 11 | int inner_dim,
|
12 | 12 | int tiles,
|
13 |
| - const char* input_data, |
14 |
| - char* output_data) { |
15 |
| - CUDA_1D_KERNEL_LOOP(index, outer_dim * tiles) { |
16 |
| - int i = index / tiles; |
17 |
| - int t = index % tiles; |
18 |
| - const char* input_ptr = input_data + inner_dim * item_size * i; |
19 |
| - char* output_ptr = output_data + (i * tiles + t) * inner_dim * item_size; |
20 |
| - memcpy(output_ptr, input_ptr, inner_dim * item_size); |
| 13 | + const T* input_data, |
| 14 | + T* output_data) { |
| 15 | + CUDA_1D_KERNEL_LOOP(index, outer_dim * inner_dim * tiles) { |
| 16 | + int col = index % inner_dim; |
| 17 | + int row = index / (inner_dim * tiles); |
| 18 | + output_data[index] = input_data[row * inner_dim + col]; |
21 | 19 | }
|
22 | 20 | }
|
23 | 21 |
|
@@ -58,12 +56,16 @@ void TileOp<CUDAContext>::DoTile(
|
58 | 56 | int inner_dim,
|
59 | 57 | const char* input_data,
|
60 | 58 | char* output_data) {
|
61 |
| - TileCopyKernel<<< |
62 |
| - std::min(outer_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS), |
63 |
| - CAFFE_CUDA_NUM_THREADS, |
64 |
| - 0, |
65 |
| - context_.cuda_stream()>>>( |
66 |
| - item_size, outer_dim, inner_dim, tiles_, input_data, output_data); |
| 59 | + TileCopyKernel<float> |
| 60 | + <<<std::min(outer_dim * inner_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS), |
| 61 | + CAFFE_CUDA_NUM_THREADS, |
| 62 | + 0, |
| 63 | + context_.cuda_stream()>>>( |
| 64 | + outer_dim, |
| 65 | + inner_dim, |
| 66 | + tiles_, |
| 67 | + reinterpret_cast<const float*>(input_data), |
| 68 | + reinterpret_cast<float*>(output_data)); |
67 | 69 | }
|
68 | 70 |
|
69 | 71 | template <>
|
|
0 commit comments