
Commit 5653a91

VitalyFedyunin authored and facebook-github-bot committed
Implement reference counting for shared IPC CUDA tensors (pytorch#16854)
Summary:
This is to fix pytorch#16141 and similar issues.

The idea is to track a reference to every shared CUDA Storage and deallocate memory only after a consumer process deallocates received Storage.

ezyang Done with cleanup. Same (insignificantly better) performance as in file-per-share solution, but handles millions of shared tensors easily.

Note: [ ] documentation in progress

Pull Request resolved: pytorch#16854

Differential Revision: D13994490

Pulled By: VitalyFedyunin

fbshipit-source-id: 565148ec3ac4fafb32d37fde0486b325bed6fbd1
1 parent f5ea528 commit 5653a91

15 files changed: +841 -88 lines changed
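The summary sketches the approach only briefly, so here is a toy model of the reference-counting idea in Python. It is purely illustrative, not the commit's actual C++ implementation; the SharedBlockRef class and its method names are invented for this example.

::

    # Toy model only: every storage shared over CUDA IPC gets a counter that
    # keeps the producer-side memory alive until all consumers release it.
    class SharedBlockRef:
        def __init__(self, free_fn):
            self.refcount = 0
            self.free_fn = free_fn     # returns the cached CUDA block to the allocator

        def on_sent(self):
            self.refcount += 1         # a consumer process received the storage

        def on_released(self):
            self.refcount -= 1         # a consumer deleted its copy of the storage
            if self.refcount == 0:
                self.free_fn()         # only now is the producer's memory freed

    # usage sketch
    ref = SharedBlockRef(free_fn=lambda: print("block freed"))
    ref.on_sent()        # producer shares the storage with a consumer
    ref.on_released()    # consumer is done -> "block freed"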

c10/core/StorageImpl.h

+14
@@ -19,6 +19,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
         data_ptr_(std::move(data_ptr)),
         numel_(numel),
         resizable_(resizable),
+        received_cuda_(false),
         allocator_(allocator) {
     if (resizable) {
       AT_ASSERTM(
@@ -210,11 +211,24 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
     resizable_ = false;
   }
 
+  // This method can be used only after storage construction and cannot be used
+  // to modify storage status
+  void set_received_cuda(bool received_cuda) {
+    received_cuda_ = received_cuda;
+  }
+
+  bool received_cuda() {
+    return received_cuda_;
+  }
+
  private:
   caffe2::TypeMeta data_type_;
   DataPtr data_ptr_;
   int64_t numel_;
   bool resizable_;
+  // Identifies that Storage was received from another process and doesn't have
+  // local to process cuda memory allocation
+  bool received_cuda_;
   Allocator* allocator_;
 };
 } // namespace c10

c10/cuda/CUDACachingAllocator.cpp

+37 -16

@@ -16,8 +16,10 @@
 #include <vector>
 
 namespace c10 {
-namespace cuda {
 
+C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
+
+namespace cuda {
 namespace CUDACachingAllocator {
 
 //
@@ -47,6 +49,8 @@ namespace CUDACachingAllocator {
 // work.
 //
 
+
+
 namespace {
 
 using stream_set = std::unordered_set<cuda::CUDAStream>;
@@ -154,7 +158,7 @@ struct THCCachingAllocator
   std::vector<DeviceStats> device_stats;
 
   // lock around all operations
-  std::mutex mutex;
+  std::recursive_mutex mutex;
 
   // lock around calls to cudaFree (to prevent deadlocks with NCCL)
   std::mutex cuda_free_mutex;
@@ -186,7 +190,7 @@ struct THCCachingAllocator
   /** allocates a block which is safe to use from the provided stream */
   void malloc(void** devPtr, size_t size, cudaStream_t stream)
   {
-    std::lock_guard<std::mutex> lock(mutex);
+    std::lock_guard<std::recursive_mutex> lock(mutex);
 
     int device;
     C10_CUDA_CHECK(cudaGetDevice(&device));
@@ -201,14 +205,29 @@
     Block search_key(device, stream, size);
     auto& pool = get_pool(size);
 
-    Block* block = nullptr;
-    Block* remaining = nullptr;
-
-    auto it = pool.lower_bound(&search_key);
-    if (it != pool.end() && (*it)->device == device && (*it)->stream == stream) {
-      block = *it;
-      pool.erase(it);
-    } else {
+    auto find_free_block = [&]()->Block*{
+      auto it = pool.lower_bound(&search_key);
+      if (it != pool.end() && (*it)->device == device &&
+          (*it)->stream == stream) {
+        Block* block = *it;
+        pool.erase(it);
+        return block;
+      }
+      return nullptr;
+    };
+
+    Block* block = find_free_block();
+    if (block == nullptr) {
+      bool freed_memory = false;
+      for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) {
+        freed_memory |=
+            FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute();
+      }
+      if (freed_memory) {
+        block = find_free_block();
+      }
+    }
+    if (block == nullptr) {
       void* ptr;
       size_t alloc_size = get_allocation_size(size);
       cudaError_t err = cuda_malloc_retry(device, &ptr, alloc_size);
@@ -253,8 +272,10 @@
       block = new Block(device, stream, alloc_size, &pool, ptr);
     }
 
+    Block* remaining = nullptr;
     AT_ASSERT(block);
     if (should_split(block, size)) {
+
       remaining = block;
 
       block = new Block(device, stream, size, &pool, block->ptr);
@@ -280,7 +301,7 @@
 
   void free(void* ptr)
   {
-    std::lock_guard<std::mutex> lock(mutex);
+    std::lock_guard<std::recursive_mutex> lock(mutex);
     if (!ptr) {
       return;
     }
@@ -305,14 +326,14 @@
   /** returns cached blocks to the system allocator */
   void emptyCache()
   {
-    std::lock_guard<std::mutex> lock(mutex);
+    std::lock_guard<std::recursive_mutex> lock(mutex);
     free_blocks(large_blocks, large_blocks.begin(), large_blocks.end());
     free_blocks(small_blocks, small_blocks.begin(), small_blocks.end());
   }
 
   void* getBaseAllocation(void* ptr, size_t* outSize)
   {
-    std::lock_guard<std::mutex> lock(mutex);
+    std::lock_guard<std::recursive_mutex> lock(mutex);
     Block* block = find_allocated_block(ptr);
     if (!block) {
       AT_ERROR("invalid device pointer: %p", ptr);
@@ -348,14 +369,14 @@
 
   void cacheInfo(int dev_id, size_t* total, size_t* largest)
   {
-    std::lock_guard<std::mutex> lock(mutex);
+    std::lock_guard<std::recursive_mutex> lock(mutex);
     cacheInfoAux(large_blocks, dev_id, total, largest);
     cacheInfoAux(small_blocks, dev_id, total, largest);
   }
 
   void recordStream(void* ptr, cuda::CUDAStream stream)
   {
-    std::lock_guard<std::mutex> lock(mutex);
+    std::lock_guard<std::recursive_mutex> lock(mutex);
     Block* block = find_allocated_block(ptr);
     if (!block) {
       AT_ERROR("invalid device pointer: %p", ptr);

c10/cuda/CUDACachingAllocator.h

+14
@@ -4,10 +4,24 @@
 #include <c10/cuda/CUDAStream.h>
 #include <c10/core/Allocator.h>
 #include <c10/cuda/CUDAMacros.h>
+#include <c10/util/Registry.h>
 
 #include <mutex>
 
 namespace c10 {
+
+// Caching allocator will execute every registered callback if it unable to find
+// block inside of already allocated area.
+class C10_CUDA_API FreeMemoryCallback {
+ public:
+  virtual ~FreeMemoryCallback() {};
+  virtual bool Execute() = 0;
+};
+
+C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
+#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
+
 namespace cuda {
 
 // TODO: Turn this into an honest to goodness class. I briefly attempted to do

docs/source/multiprocessing.rst

+55 -47

@@ -28,57 +28,65 @@ Python 2 can only create subprocesses using ``fork``, and it's not supported
 by the CUDA runtime.
 
 Unlike CPU tensors, the sending process is required to keep the original tensor
-as long as the receiving process retains a copy of the tensor.
-This shouldn't be a problem for sharing model parameters (which stay live
-for the entire execution of the model), but passing other
-kinds of data should be done with care.
+as long as the receiving process retains a copy of the tensor. It is implemented
+under the hood but requires users to follow the next best practices.
 
-Here is an example program which handles these requirements correctly:
+1. Release memory ASAP in the consumer.
 
 ::
 
-    import torch
-    import torch.multiprocessing as mp
-
-    torch.set_default_tensor_type(torch.cuda.FloatTensor)
-
-    def sender(q, e):
-        for i in range(10):
-            s_sample = [torch.zeros(1), torch.ones(1)]
-            q.put(s_sample)
-            e.wait()
-            del s_sample
-            e.clear()
-
-    if __name__ == "__main__":
-        ctx = mp.get_context("spawn")
-        q = ctx.Queue()
-        e = ctx.Event()
-        p = ctx.Process(target=sender, args=(q, e))
-        p.start()
-
-        for i in range(10):
-            print('=== ITER {} ===".format(i))
-            r_sample = q.get()
-            del r_sample
-            e.set()
-
-        p.join()
-
-In the example above, calling `e.wait()`
-on sender side ensures tensor `s_sample` doesn't get deleted while
-receiver is working on it. The receiver signals when it is done
-with the tensor using `e.set()`, being careful to `del` its reference
-to the received tensor first. It is INSUFFICIENT to promise never to call
-`r_sample` again; while `r_sample` is live, it may be confused with
-any subsequent tensors allocated by the source process at the same address.
-
-If a receiver wants to save the data of `r_sample` for future use while
-letting the source process deallocate the original, it must
-`clone()` it.
-
-This behavior is very confusing, and we are tracking a fix for it
-at https://github.com/pytorch/pytorch/issues/16141
+    ## Good
+    x = queue.get()
+    # do somethings with x
+    del x
 
+::
+
+    ## Bad
+    x = queue.get()
+    # do somethings with x
+    # do everything else (producer have to keep x in memory)
+
+2. Keep producer process running until all consumers exits. This will prevent
+the situation when the producer process releasing memory which is still in use
+by the consumer.
+
+::
+
+    ## producer
+    # send tensors, do something
+    event.wait()
+
+
+::
+
+    ## consumer
+    # receive tensors and use them
+    event.set()
+
+3. Don't pass received tensors.
+
+::
+
+    # not going to work
+    x = queue.get()
+    queue_2.put(x)
+
+
+::
+
+    # you need to create a process-local copy
+    x = queue.get()
+    x_clone = x.clone()
+    queue_2.put(x_clone)
+
+
+::
+
+    # putting and getting from the same queue in the same process will likely end up with segfault
+    queue.put(tensor)
+    x = queue.get()
+
 
 Sharing strategies
 ------------------
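For reference, the following sketch shows how the three practices added above might be combined in one program, in the spirit of the example that this change removes. It is illustrative only and not part of the commit; the tensor sizes, iteration count, and variable names are arbitrary.

::

    import torch
    import torch.multiprocessing as mp

    def producer(queue, done):
        for i in range(10):
            queue.put(torch.zeros(2, device='cuda'))
        # Practice 2: keep the producer alive until the consumer has
        # finished with every shared tensor.
        done.wait()

    if __name__ == '__main__':
        ctx = mp.get_context('spawn')   # CUDA sharing requires spawn, not fork
        queue = ctx.Queue()
        done = ctx.Event()
        p = ctx.Process(target=producer, args=(queue, done))
        p.start()

        for i in range(10):
            x = queue.get()
            y = x.clone()   # Practice 3: copy before storing or re-sending
            del x           # Practice 1: release the shared storage ASAP
            # ... keep working with y, which owns process-local memory ...

        done.set()
        p.join()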
