Commit 4108dd9

tutorial 6
1 parent 5f7d4e8 commit 4108dd9

File tree

4 files changed, +127 -15 lines changed

include/utils.h
interpolation.cpp
interpolation_kernel.cu
test.py

include/utils.h  (+9 -2)

@@ -6,6 +6,13 @@
 
 
 torch::Tensor trilinear_fw_cu(
-    torch::Tensor feats,
-    torch::Tensor points
+    const torch::Tensor feats,
+    const torch::Tensor points
+);
+
+
+torch::Tensor trilinear_bw_cu(
+    const torch::Tensor dL_dfeat_interp,
+    const torch::Tensor feats,
+    const torch::Tensor points
 );

interpolation.cpp  (+18 -4)

@@ -1,9 +1,9 @@
 #include "utils.h"
 
 
-torch::Tensor trilinear_interpolation(
-    torch::Tensor feats,
-    torch::Tensor points
+torch::Tensor trilinear_interpolation_fw(
+    const torch::Tensor feats,
+    const torch::Tensor points
 ){
     CHECK_INPUT(feats);
     CHECK_INPUT(points);
@@ -12,6 +12,20 @@ torch::Tensor trilinear_interpolation(
 }
 
 
+torch::Tensor trilinear_interpolation_bw(
+    const torch::Tensor dL_dfeat_interp,
+    const torch::Tensor feats,
+    const torch::Tensor points
+){
+    CHECK_INPUT(dL_dfeat_interp);
+    CHECK_INPUT(feats);
+    CHECK_INPUT(points);
+
+    return trilinear_bw_cu(dL_dfeat_interp, feats, points);
+}
+
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
-    m.def("trilinear_interpolation", &trilinear_interpolation);
+    m.def("trilinear_interpolation_fw", &trilinear_interpolation_fw);
+    m.def("trilinear_interpolation_bw", &trilinear_interpolation_bw);
 }
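Both wrappers validate that the inputs are contiguous CUDA tensors before dispatching to the launchers declared in include/utils.h, and the two m.def lines expose them to Python under the cppcuda_tutorial module name imported by test.py. For context, a minimal setup.py sketch for compiling such an extension with torch.utils.cpp_extension (not part of this commit; the source paths and include directory are assumptions based on the file names in this diff):

from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension

setup(
    name='cppcuda_tutorial',
    ext_modules=[
        # builds the C++ bindings and the CUDA kernels into one Python module
        CUDAExtension(
            name='cppcuda_tutorial',
            sources=['interpolation.cpp', 'interpolation_kernel.cu'],
            include_dirs=['include'],  # assumed location of utils.h
        ),
    ],
    cmdclass={'build_ext': BuildExtension},
)

After a pip install ., import cppcuda_tutorial gives access to trilinear_interpolation_fw and trilinear_interpolation_bw as called in test.py.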

interpolation_kernel.cu  (+61 -4)

@@ -33,14 +33,12 @@ __global__ void trilinear_fw_kernel(
 
 
 torch::Tensor trilinear_fw_cu(
-    torch::Tensor feats,
-    torch::Tensor points
+    const torch::Tensor feats,
+    const torch::Tensor points
 ){
     const int N = feats.size(0), F = feats.size(2);
 
     torch::Tensor feat_interp = torch::zeros({N, F}, feats.options());
-    torch::Tensor feat_interp2 = torch::zeros({N, F}, feats.options());
-
 
     const dim3 threads(16, 16);
     const dim3 blocks((N+threads.x-1)/threads.x, (F+threads.y-1)/threads.y);
@@ -55,4 +53,63 @@ torch::Tensor trilinear_fw_cu(
     }));
 
     return feat_interp;
+}
+
+
+template <typename scalar_t>
+__global__ void trilinear_bw_kernel(
+    const torch::PackedTensorAccessor<scalar_t, 2, torch::RestrictPtrTraits, size_t> dL_dfeat_interp,
+    const torch::PackedTensorAccessor<scalar_t, 3, torch::RestrictPtrTraits, size_t> feats,
+    const torch::PackedTensorAccessor<scalar_t, 2, torch::RestrictPtrTraits, size_t> points,
+    torch::PackedTensorAccessor<scalar_t, 3, torch::RestrictPtrTraits, size_t> dL_dfeats
+){
+    const int n = blockIdx.x * blockDim.x + threadIdx.x;
+    const int f = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (n>=feats.size(0) || f>=feats.size(2)) return;
+
+    // point -1~1
+    const scalar_t u = (points[n][0]+1)/2;
+    const scalar_t v = (points[n][1]+1)/2;
+    const scalar_t w = (points[n][2]+1)/2;
+
+    const scalar_t a = (1-v)*(1-w);
+    const scalar_t b = (1-v)*w;
+    const scalar_t c = v*(1-w);
+    const scalar_t d = 1-a-b-c;
+
+    dL_dfeats[n][0][f] = (1-u)*a*dL_dfeat_interp[n][f];
+    dL_dfeats[n][1][f] = (1-u)*b*dL_dfeat_interp[n][f];
+    dL_dfeats[n][2][f] = (1-u)*c*dL_dfeat_interp[n][f];
+    dL_dfeats[n][3][f] = (1-u)*d*dL_dfeat_interp[n][f];
+    dL_dfeats[n][4][f] = u*a*dL_dfeat_interp[n][f];
+    dL_dfeats[n][5][f] = u*b*dL_dfeat_interp[n][f];
+    dL_dfeats[n][6][f] = u*c*dL_dfeat_interp[n][f];
+    dL_dfeats[n][7][f] = u*d*dL_dfeat_interp[n][f];
+}
+
+
+torch::Tensor trilinear_bw_cu(
+    const torch::Tensor dL_dfeat_interp,
+    const torch::Tensor feats,
+    const torch::Tensor points
+){
+    const int N = feats.size(0), F = feats.size(2);
+
+    torch::Tensor dL_dfeats = torch::zeros({N, 8, F}, feats.options());
+
+    const dim3 threads(16, 16);
+    const dim3 blocks((N+threads.x-1)/threads.x, (F+threads.y-1)/threads.y);
+
+    AT_DISPATCH_FLOATING_TYPES(feats.type(), "trilinear_bw_cu",
+    ([&] {
+        trilinear_bw_kernel<scalar_t><<<blocks, threads>>>(
+            dL_dfeat_interp.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>(),
+            feats.packed_accessor<scalar_t, 3, torch::RestrictPtrTraits, size_t>(),
+            points.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>(),
+            dL_dfeats.packed_accessor<scalar_t, 3, torch::RestrictPtrTraits, size_t>()
+        );
+    }));
+
+    return dL_dfeats;
 }
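The backward kernel mirrors the forward interpolation: the forward output is a weighted sum of the eight cube-corner features, feat_interp[n][f] = sum_k w_k * feats[n][k][f], with the weights w_k built from u, v, w as above, so the gradient with respect to each corner feature is just the corresponding weight times the incoming gradient, dL_dfeats[n][k][f] = w_k * dL_dfeat_interp[n][f], which is exactly what trilinear_bw_kernel writes out, one (n, f) pair per thread. A pure-PyTorch sketch of the same computation (a hypothetical helper, not part of the commit, useful only for cross-checking the kernel):

import torch

def trilinear_bw_py(dL_dfeat_interp, feats, points):
    # feats is only needed for its shape in the CUDA version; unused here
    # points lie in [-1, 1]; map to [0, 1] exactly as in trilinear_bw_kernel
    u = (points[:, 0:1] + 1) / 2   # (N, 1)
    v = (points[:, 1:2] + 1) / 2
    w = (points[:, 2:3] + 1) / 2

    a = (1 - v) * (1 - w)
    b = (1 - v) * w
    c = v * (1 - w)
    d = 1 - a - b - c

    # weights of the 8 corners, in the order the kernel writes dL_dfeats
    weights = torch.cat([(1 - u) * a, (1 - u) * b, (1 - u) * c, (1 - u) * d,
                         u * a, u * b, u * c, u * d], dim=1)        # (N, 8)

    # dL/dfeats[n, k, f] = w_k(n) * dL/dfeat_interp[n, f]
    return weights.unsqueeze(-1) * dL_dfeat_interp.unsqueeze(1)     # (N, 8, F)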

test.py  (+39 -5)

@@ -32,19 +32,53 @@ def trilinear_interpolation_py(feats, points):
     return feats_interp
 
 
+class Trilinear_interpolation_cuda(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, feats, points):
+        feat_interp = cppcuda_tutorial.trilinear_interpolation_fw(feats, points)
+
+        ctx.save_for_backward(feats, points)
+
+        return feat_interp
+
+    @staticmethod
+    def backward(ctx, dL_dfeat_interp):
+        feats, points = ctx.saved_tensors
+
+        dL_dfeats = cppcuda_tutorial.trilinear_interpolation_bw(dL_dfeat_interp.contiguous(), feats, points)
+
+        return dL_dfeats, None
+
+
 if __name__ == '__main__':
     N = 65536; F = 256
-    feats = torch.rand(N, 8, F, device='cuda').requires_grad_()
+    rand = torch.rand(N, 8, F, device='cuda')
+    feats = rand.clone().requires_grad_()
+    feats2 = rand.clone().requires_grad_()
     points = torch.rand(N, 3, device='cuda')*2-1
 
     t = time.time()
-    out_cuda = cppcuda_tutorial.trilinear_interpolation(feats, points)
+    out_cuda = Trilinear_interpolation_cuda.apply(feats2, points)
     torch.cuda.synchronize()
-    print(' cuda time', time.time()-t, 's')
+    print(' cuda fw time', time.time()-t, 's')
 
     t = time.time()
     out_py = trilinear_interpolation_py(feats, points)
     torch.cuda.synchronize()
-    print('pytorch time', time.time()-t, 's')
+    print('pytorch fw time', time.time()-t, 's')
+
+    print('fw all close', torch.allclose(out_py, out_cuda))
+
+    t = time.time()
+    loss2 = out_cuda.sum()
+    loss2.backward()
+    torch.cuda.synchronize()
+    print(' cuda bw time', time.time()-t, 's')
+
+    t = time.time()
+    loss = out_py.sum()
+    loss.backward()
+    torch.cuda.synchronize()
+    print('pytorch bw time', time.time()-t, 's')
 
-    print(torch.allclose(out_py, out_cuda))
+    print('bw all close', torch.allclose(feats.grad, feats2.grad))
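Besides comparing feats.grad against the pure-PyTorch reference, the custom autograd.Function can be spot-checked with torch.autograd.gradcheck on a tiny double-precision input. A sketch, not part of this commit; it relies on the Trilinear_interpolation_cuda wrapper defined above, and it works because AT_DISPATCH_FLOATING_TYPES also instantiates the kernels for float64:

# tiny float64 inputs, since gradcheck uses finite differences
feats64 = torch.rand(16, 8, 4, dtype=torch.float64, device='cuda', requires_grad=True)
points64 = torch.rand(16, 3, dtype=torch.float64, device='cuda')*2-1

# compares the analytical gradient from trilinear_interpolation_bw
# against numerical finite differences; prints True if they match
print(torch.autograd.gradcheck(Trilinear_interpolation_cuda.apply,
                               (feats64, points64)))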
