Skip to content

Commit f6c6048

Browse files
cpuhrsch authored and pytorchmergebot committed
Use CUTLASS GEMM for NT bmm (pytorch#85894)
Copy of pytorch#85710 Pull Request resolved: pytorch#85894 Approved by: https://github.com/drisspg
1 parent 80790ec commit f6c6048

10 files changed

+400
-40
lines changed

BUILD.bazel

+2
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ cu_library(
429429
"@cuda//:cublas",
430430
"@cuda//:cufft",
431431
"@cuda//:cusparse",
432+
"@cutlass",
432433
],
433434
alwayslink = True,
434435
)
@@ -1673,6 +1674,7 @@ cc_library(
16731674
] + if_cuda([
16741675
":torch_distributed_cuda",
16751676
"@cuda//:nvToolsExt",
1677+
"@cutlass",
16761678
]),
16771679
alwayslink = True,
16781680
)

WORKSPACE

+6
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ new_local_repository(
8484
path = "third_party/eigen",
8585
)
8686

87+
new_local_repository(
88+
name = "cutlass",
89+
build_file = "//third_party:cutlass.BUILD",
90+
path = "third_party/cutlass",
91+
)
92+
8793
new_local_repository(
8894
name = "fbgemm",
8995
build_file = "//third_party:fbgemm/BUILD.bazel",

aten/src/ATen/CMakeLists.txt

+1-3
Original file line numberDiff line numberDiff line change
@@ -433,9 +433,7 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
433433
endif()
434434

435435
if(USE_CUDA AND NOT USE_ROCM)
436-
if(USE_FLASH_ATTENTION)
437-
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
438-
endif()
436+
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
439437
if($ENV{ATEN_STATIC_CUDA})
440438
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
441439
${CUDA_LIBRARIES}

aten/src/ATen/native/native_functions.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -1174,7 +1174,8 @@
11741174
dispatch:
11751175
SparseCPU: bmm_sparse_cpu
11761176
SparseCUDA: bmm_sparse_cuda
1177-
NestedTensorCPU, NestedTensorCUDA: bmm_nested
1177+
NestedTensorCPU: bmm_nested
1178+
NestedTensorCUDA: bmm_nested_cuda
11781179
tags: canonical
11791180

11801181
- func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)

aten/src/ATen/native/nested/NestedTensorUtils.cpp

-22
Original file line numberDiff line numberDiff line change
@@ -108,27 +108,5 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
108108
return splits;
109109
}
110110

111-
std::vector<IntArrayRef> NestedTensor_get_sizes(
112-
const NestedTensorImpl* self_ptr) {
113-
int64_t ntensors = self_ptr->size(0);
114-
std::vector<IntArrayRef> sizes(ntensors);
115-
if (ntensors == 0) {
116-
return sizes;
117-
}
118-
const Tensor& sizemat = self_ptr->get_nested_size_tensor();
119-
int64_t orig_dim = sizemat.size(1);
120-
// nesting scalars has empty sizes
121-
if (orig_dim == 0) {
122-
return sizes;
123-
}
124-
const int64_t* sizemat_ptr = sizemat.data_ptr<int64_t>();
125-
126-
for (const auto i : c10::irange(ntensors)) {
127-
sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim);
128-
sizemat_ptr += orig_dim;
129-
}
130-
return sizes;
131-
}
132-
133111
} // namespace native
134112
} // namespace at

aten/src/ATen/native/nested/NestedTensorUtils.h

+22-2
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,28 @@ inline at::Tensor create_nested_view_tensor(
9797
int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt);
9898

9999
// The sizes of the underlying tensors
100-
std::vector<IntArrayRef> NestedTensor_get_sizes(
101-
const NestedTensorImpl* self_ptr);
100+
inline std::vector<IntArrayRef> NestedTensor_get_sizes(
101+
const NestedTensorImpl* self_ptr) {
102+
int64_t ntensors = self_ptr->size(0);
103+
std::vector<IntArrayRef> sizes(ntensors);
104+
if (ntensors == 0) {
105+
return sizes;
106+
}
107+
const Tensor& sizemat = self_ptr->get_nested_size_tensor();
108+
int64_t orig_dim = sizemat.size(1);
109+
// nesting scalars has empty sizes
110+
if (orig_dim == 0) {
111+
return sizes;
112+
}
113+
const int64_t* sizemat_ptr = sizemat.data_ptr<int64_t>();
114+
115+
for (const auto i : c10::irange(ntensors)) {
116+
sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim);
117+
sizemat_ptr += orig_dim;
118+
}
119+
return sizes;
120+
}
121+
102122

103123
TORCH_API std::vector<int64_t> NestedTensor_get_max_size(
104124
const NestedTensorImpl& nt);

0 commit comments

Comments
 (0)