upd

kwea123 · kwea123 · commit 5f7d4e86da99 · 2022-09-25T10:04:08.000+09:00
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+.vscode/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/interpolation_kernel.cu b/interpolation_kernel.cu
@@ -22,13 +22,13 @@ __global__ void trilinear_fw_kernel(
     const scalar_t c = v*(1-w);
     const scalar_t d = 1-a-b-c;
     feat_interp[n][f] = (1-u)*(a*feats[n][0][f] +
-                                b*feats[n][1][f] +
-                                c*feats[n][2][f] +
-                                d*feats[n][3][f]) + 
+                               b*feats[n][1][f] +
+                               c*feats[n][2][f] +
+                               d*feats[n][3][f]) + 
                             u*(a*feats[n][4][f] +
-                                b*feats[n][5][f] +
-                                c*feats[n][6][f] +
-                                d*feats[n][7][f]);
+                               b*feats[n][5][f] +
+                               c*feats[n][6][f] +
+                               d*feats[n][7][f]);
 }
 
 
@@ -39,6 +39,8 @@ torch::Tensor trilinear_fw_cu(
     const int N = feats.size(0), F = feats.size(2);
     
     torch::Tensor feat_interp = torch::zeros({N, F}, feats.options());
+    torch::Tensor feat_interp2 = torch::zeros({N, F}, feats.options());
+
 
     const dim3 threads(16, 16);
     const dim3 blocks((N+threads.x-1)/threads.x, (F+threads.y-1)/threads.y);
@@ -51,5 +53,6 @@ torch::Tensor trilinear_fw_cu(
             feat_interp.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>()
         );
     }));
-    
+
+    return feat_interp;
 }
diff --git a/test.py b/test.py
@@ -1,12 +1,50 @@
 import torch
 import cppcuda_tutorial
+import time
+
+
+def trilinear_interpolation_py(feats, points):
+    """
+    Inputs:
+        feats: (N, 8, F)
+        points: (N, 3) local coordinates in [-1, 1]
+    
+    Outputs:
+        feats_interp: (N, F)
+    """
+    u = (points[:, 0:1]+1)/2
+    v = (points[:, 1:2]+1)/2
+    w = (points[:, 2:3]+1)/2
+    a = (1-v)*(1-w)
+    b = (1-v)*w
+    c = v*(1-w)
+    d = 1-a-b-c
+
+    feats_interp = (1-u)*(a*feats[:, 0] +
+                          b*feats[:, 1] +
+                          c*feats[:, 2] +
+                          d*feats[:, 3]) + \
+                       u*(a*feats[:, 4] +
+                          b*feats[:, 5] +
+                          c*feats[:, 6] +
+                          d*feats[:, 7])
+    
+    return feats_interp
 
 
 if __name__ == '__main__':
+    N = 65536; F = 256
+    feats = torch.rand(N, 8, F, device='cuda').requires_grad_()
+    points = torch.rand(N, 3, device='cuda')*2-1
 
-    feats = torch.ones(2, device='cuda')
-    points = torch.zeros(2, device='cuda')
+    t = time.time()
+    out_cuda = cppcuda_tutorial.trilinear_interpolation(feats, points)
+    torch.cuda.synchronize()
+    print('   cuda time', time.time()-t, 's')
 
-    out = cppcuda_tutorial.trilinear_interpolation(feats, points)
+    t = time.time()
+    out_py = trilinear_interpolation_py(feats, points)
+    torch.cuda.synchronize()
+    print('pytorch time', time.time()-t, 's')
 
-    print(out)
+    print(torch.allclose(out_py, out_cuda))

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,3 +1,4 @@` |
|  | `1` | `+.vscode/` |
| `1` | `2` | `# Byte-compiled / optimized / DLL files` |
| `2` | `3` | `__pycache__/` |
| `3` | `4` | `*.py[cod]` |