kevinch-nv
diff --git a/‎aten/src/TH/THAllocator.cpp
Lines changed: 3 additions & 12 deletions b/‎aten/src/TH/THAllocator.cpp
Lines changed: 3 additions & 12 deletions
diff --git a/‎aten/src/TH/THGeneral.cpp
Lines changed: 5 additions & 41 deletions b/‎aten/src/TH/THGeneral.cpp
Lines changed: 5 additions & 41 deletions
diff --git a/‎c10/core/Allocator.cpp
Lines changed: 2 additions & 7 deletions b/‎c10/core/Allocator.cpp
Lines changed: 2 additions & 7 deletions
diff --git a/‎c10/core/Allocator.h
Lines changed: 2 additions & 7 deletions b/‎c10/core/Allocator.h
Lines changed: 2 additions & 7 deletions
diff --git a/‎c10/core/CPUAllocator.cpp
Lines changed: 170 additions & 0 deletions b/‎c10/core/CPUAllocator.cpp
Lines changed: 170 additions & 0 deletions
diff --git a/‎c10/core/CPUAllocator.h
Lines changed: 41 additions & 0 deletions b/‎c10/core/CPUAllocator.h
Lines changed: 41 additions & 0 deletions
diff --git a/‎c10/core/StorageImpl.h
Lines changed: 1 addition & 1 deletion b/‎c10/core/StorageImpl.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎c10/core/TensorImpl.h
Lines changed: 1 addition & 1 deletion b/‎c10/core/TensorImpl.h
Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,8 @@
 #define TH_ATOMIC_IPC_REFCOUNT 1
 #endif
 
+#include <c10/core/CPUAllocator.h>
+
 #if HAVE_MMAP
 #include <sys/types.h>
 #include <sys/mman.h>
@@ -19,19 +21,8 @@
 #endif
 /* end of stuff for mapped files */
 
-struct THDefaultAllocator final : public at::Allocator {
-  at::DataPtr allocate(size_t size) const override {
-    auto* ptr = THAlloc(size);
-    return {ptr, ptr, &THFree, at::DeviceType::CPU};
-  }
-  at::DeleterFnPtr raw_deleter() const override {
-    return &THFree;
-  }
-};
-
-static THDefaultAllocator th_default_allocator;
 at::Allocator* getTHDefaultAllocator() {
-  return &th_default_allocator;
+  return c10::GetCPUAllocator();
 }
 
 #if defined(_WIN32) || defined(HAVE_MMAP)
 
@@ -1,5 +1,9 @@
 #include <TH/THGeneral.h>
 
+#ifdef __cplusplus
+#include <c10/core/CPUAllocator.h>
+#endif
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -155,52 +159,12 @@ void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data )
   torchGCData = data;
 }
 
-static void* THAllocInternal(ptrdiff_t size)
-{
-  void *ptr;
-
-  if (size > 5120)
-  {
-#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN))
-    if (posix_memalign(&ptr, 64, size) != 0)
-      ptr = NULL;
-/*
-#elif defined(_WIN32)
-    ptr = _aligned_malloc(size, 64);
-*/
-#else
-    ptr = malloc(size);
-#endif
-  }
-  else
-  {
-    ptr = malloc(size);
-  }
-
-  return ptr;
-}
-
 void* THAlloc(ptrdiff_t size)
 {
-  void *ptr;
-
   if(size < 0)
     THError("$ Torch: invalid memory size -- maybe an overflow?");
 
-  if(size == 0)
-    return NULL;
-
-  ptr = THAllocInternal(size);
-
-  if(!ptr && torchGCFunction) {
-    torchGCFunction(torchGCData);
-    ptr = THAllocInternal(size);
-  }
-
-  if(!ptr)
-    THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824);
-
-  return ptr;
+  return c10::alloc_cpu(size);
 }
 
 void* THRealloc(void *ptr, ptrdiff_t size)
 
@@ -16,12 +16,7 @@ at::DataPtr InefficientStdFunctionContext::makeDataPtr(
           device};
 }
 
-} // namespace c10
-
-namespace caffe2 {
-
-C10_API at::Allocator* allocator_array[static_cast<int>(
-    at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)];
+C10_API at::Allocator* allocator_array[at::COMPILE_TIME_MAX_DEVICE_TYPES];
 
 void SetAllocator(at::DeviceType t, at::Allocator* alloc) {
   allocator_array[static_cast<int>(t)] = alloc;
@@ -33,4 +28,4 @@ at::Allocator* GetAllocator(const at::DeviceType& t) {
   return alloc;
 }
 
-} // namespace caffe2
+} // namespace c10
@@ -16,7 +16,7 @@ namespace c10 {
 // nullptr DataPtrs can still have a nontrivial device; this allows
 // us to treat zero-size allocations uniformly with non-zero allocations.
 //
-class DataPtr {
+class C10_API DataPtr {
  private:
   c10::detail::UniqueVoidPtr ptr_;
   Device device_;
@@ -181,11 +181,6 @@ struct C10_API InefficientStdFunctionContext {
       Device device);
 };
 
-} // namespace c10
-
-// TODO: move to c10
-namespace caffe2 {
-
 /** Set the allocator for DeviceType `t`. The passed in allocator pointer is
  *  expected to have static lifetime; this function does NOT take ownership
  *  of the raw pointer. (The reason for this is to prevent existing pointers
@@ -210,4 +205,4 @@ struct AllocatorRegisterer {
   static AllocatorRegisterer<t> g_allocator_##d(f); \
   }
 
-} // namespace caffe2
+} // namespace c10
@@ -0,0 +1,170 @@
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/typeid.h>
+#include <c10/core/DeviceType.h>
+
+// TODO: rename flags to C10
+C10_DEFINE_bool(
+    caffe2_report_cpu_memory_usage,
+    false,
+    "If set, print out detailed memory usage");
+
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_zero_fill,
+    false,
+    "If set, do memory zerofilling when allocating on CPU");
+
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_junk_fill,
+    false,
+    "If set, fill memory with deterministic junk when allocating on CPU");
+
+namespace c10 {
+
+// Fills the `num` bytes starting at `data` with a repeating junk pattern.
+//
+// Whole 64-bit words are written first; any trailing bytes (num % 8) are
+// copied from the start of the same 64-bit pattern.
+void memset_junk(void* data, size_t num) {
+  // This garbage pattern is NaN when interpreted as floating point values,
+  // or as very large integer values.
+  static constexpr int32_t kJunkPattern = 0x7fedbeef;
+  static constexpr int64_t kJunkPattern64 =
+      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
+  // Keep the counts in size_t: `num` is a size_t, and narrowing the word
+  // count to a 32-bit integer would truncate it for very large buffers.
+  const size_t int64_count = num / sizeof(kJunkPattern64);
+  const size_t remaining_bytes = num % sizeof(kJunkPattern64);
+  int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
+  for (size_t i = 0; i < int64_count; i++) {
+    data_i64[i] = kJunkPattern64;
+  }
+  if (remaining_bytes > 0) {
+    memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
+  }
+}
+
+// Allocates `nbytes` of CPU memory aligned to gAlignment.
+//
+// A zero-byte request returns nullptr without allocating. A failed
+// allocation trips the CAFFE_ENFORCE checks below. On success the buffer is
+// handed to NUMAMove for the current NUMA node and then, depending on the
+// caffe2_cpu_allocator_do_zero_fill / caffe2_cpu_allocator_do_junk_fill
+// flags, zero-filled or junk-filled (requesting both is rejected by CHECK).
+void* alloc_cpu(size_t nbytes) {
+  if (nbytes == 0) {
+    return nullptr;
+  }
+
+  void* data;
+#ifdef __ANDROID__
+  data = memalign(gAlignment, nbytes);
+#elif defined(_MSC_VER)
+  data = _aligned_malloc(nbytes, gAlignment);
+#else
+  CAFFE_ENFORCE_EQ(posix_memalign(&data, gAlignment, nbytes), 0);
+#endif
+
+  // CAFFE_ENFORCE concatenates its message arguments rather than applying
+  // printf-style formatting, so the size is passed as a separate argument
+  // instead of through a "%d" placeholder.
+  CAFFE_ENFORCE(
+      data,
+      "DefaultCPUAllocator: not enough memory: you tried to allocate ",
+      nbytes,
+      " bytes. Buy new RAM!");
+
+  // move data to a thread's NUMA node
+  NUMAMove(data, nbytes, GetCurrentNUMANode());
+  CHECK(
+      !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
+      !FLAGS_caffe2_cpu_allocator_do_junk_fill)
+    << "Cannot request both zero-fill and junk-fill at the same time";
+  if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
+    memset(data, 0, nbytes);
+  } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
+    memset_junk(data, nbytes);
+  }
+
+  return data;
+}
+
+// A helper class that is used to report C10's memory allocation and
+// deallocation status. It keeps a table of pointer -> size for recorded
+// allocations plus a running total of bytes; New() and Delete() take
+// mutex_ before touching either.
+class C10_API MemoryAllocationReporter {
+ public:
+  MemoryAllocationReporter() : allocated_(0) {}
+  // Records that `nbytes` were allocated at `ptr` and logs the new total.
+  void New(void* ptr, size_t nbytes);
+  // Removes the record for `ptr` and logs the new total; `ptr` must have
+  // been recorded via New().
+  void Delete(void* ptr);
+
+ private:
+  std::mutex mutex_;
+  // Size in bytes of each recorded allocation, keyed by its pointer.
+  std::unordered_map<void*, size_t> size_table_;
+  // Sum of the sizes currently recorded in size_table_.
+  size_t allocated_;
+};
+
+// Allocator that hands out CPU memory obtained from alloc_cpu(). When
+// FLAGS_caffe2_report_cpu_memory_usage is set, non-empty allocations are
+// additionally recorded in a shared MemoryAllocationReporter.
+struct C10_API DefaultCPUAllocator final : at::Allocator {
+  DefaultCPUAllocator() {}
+  ~DefaultCPUAllocator() override {}
+
+  // Allocates `nbytes` and wraps the buffer in a CPU DataPtr. The reporting
+  // deleter is attached only when the allocation was actually recorded.
+  at::DataPtr allocate(size_t nbytes) const override {
+    void* mem = alloc_cpu(nbytes);
+    const bool track = FLAGS_caffe2_report_cpu_memory_usage && nbytes > 0;
+    if (!track) {
+      return {mem, mem, &Delete, at::Device(at::DeviceType::CPU)};
+    }
+    getMemoryAllocationReporter().New(mem, nbytes);
+    return {mem, mem, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
+  }
+
+  // Releases a buffer using the deallocation call selected for the platform.
+  static void Delete(void* data) {
+#ifdef _MSC_VER
+    _aligned_free(data);
+#else
+    free(data);
+#endif
+  }
+
+  // Removes `ptr` from the reporter and then frees it; null is a no-op.
+  static void ReportAndDelete(void* ptr) {
+    if (ptr) {
+      getMemoryAllocationReporter().Delete(ptr);
+      Delete(ptr);
+    }
+  }
+
+  // Returns the deleter matching the current value of the reporting flag.
+  at::DeleterFnPtr raw_deleter() const override {
+    return FLAGS_caffe2_report_cpu_memory_usage ? &ReportAndDelete : &Delete;
+  }
+
+ protected:
+  // Single reporter instance, created on first use.
+  static MemoryAllocationReporter& getMemoryAllocationReporter() {
+    static MemoryAllocationReporter reporter_;
+    return reporter_;
+  }
+
+};
+
+// Deleter that intentionally does nothing with the pointer it is given.
+void NoDelete(void*) {}
+
+// Returns whatever GetAllocator() yields for DeviceType::CPU.
+at::Allocator* GetCPUAllocator() {
+  return GetAllocator(DeviceType::CPU);
+}
+
+// Forwards `alloc` to SetAllocator() for DeviceType::CPU.
+void SetCPUAllocator(at::Allocator* alloc) {
+  SetAllocator(DeviceType::CPU, alloc);
+}
+
+// Global default CPU Allocator
+static DefaultCPUAllocator g_cpu_alloc;
+
+// Returns the address of the file-level DefaultCPUAllocator instance
+// (g_cpu_alloc), without going through GetAllocator().
+at::Allocator* GetDefaultCPUAllocator() {
+  return &g_cpu_alloc;
+}
+
+REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);
+
+// Records an allocation of `nbytes` at `ptr`, adds it to the running total
+// and logs both figures. The whole update happens under mutex_.
+void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  size_table_[ptr] = nbytes;
+  allocated_ += nbytes;
+  LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated_
+            << " bytes.";
+}
+
+// Drops the record for `ptr`, subtracts its size from the running total and
+// logs both figures. CHECK fails if `ptr` was never recorded. The whole
+// update happens under mutex_.
+void MemoryAllocationReporter::Delete(void* ptr) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  const auto entry = size_table_.find(ptr);
+  CHECK(entry != size_table_.end());
+  const size_t freed = entry->second;
+  allocated_ -= freed;
+  LOG(INFO) << "C10 deleted " << freed << " bytes, total alloc "
+            << allocated_ << " bytes.";
+  size_table_.erase(entry);
+}
+
+} // namespace c10
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <cstring>
+#include <unordered_map>
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Logging.h>
+#include <c10/util/numa.h>
+
+// TODO: rename to c10
+C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
+
+namespace c10 {
+
+// 64-byte alignment should be enough for computation up to AVX512.
+constexpr size_t gAlignment = 64;
+
+using MemoryDeleter = void (*)(void*);
+
+// A helper function that is basically doing nothing.
+C10_API void NoDelete(void*);
+
+// Fill the data memory region of num bytes with a particular garbage pattern.
+// The garbage value is chosen to be NaN if interpreted as floating point value,
+// or a very large integer.
+C10_API void memset_junk(void* data, size_t num);
+
+C10_API void* alloc_cpu(size_t nbytes);
+
+// Get the CPU Allocator.
+C10_API at::Allocator* GetCPUAllocator();
+// Sets the CPU allocator to the given allocator. This forwards to
+// SetAllocator, which does NOT take ownership of the raw pointer and expects
+// it to have static lifetime.
+C10_API void SetCPUAllocator(at::Allocator* alloc);
+
+// Get the Default CPU Allocator
+C10_API at::Allocator* GetDefaultCPUAllocator();
+
+} // namespace c10
@@ -53,7 +53,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
             data_type,
             0,
             at::DataPtr(nullptr, device),
-            caffe2::GetAllocator(device.type()),
+            GetAllocator(device.type()),
             true) {}
 
   StorageImpl& operator=(StorageImpl&& other) = default;
 
@@ -1188,7 +1188,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
       // know how to reallocate it. However, in order to preserve legacy C2
       // behavior, we allow reallocating the memory using default allocator.
       if (allocator == nullptr) {
-        allocator = caffe2::GetAllocator(storage_.device_type());
+        allocator = GetAllocator(storage_.device_type());
       }
       if (meta.placementNew()) {
         // For types that need placement new, we will call it, as well as
Original file line number	Diff line number	Diff line change
`@@ -1188,7 +1188,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {`
`1188`	`1188`	`// know how to reallocate it. However, in order to preserve legacy C2`
`1189`	`1189`	`// behavior, we allow reallocating the memory using default allocator.`
`1190`	`1190`	`if (allocator == nullptr) {`
`1191`		`- allocator = caffe2::GetAllocator(storage_.device_type());`
	`1191`	`+ allocator = GetAllocator(storage_.device_type());`
`1192`	`1192`	`}`
`1193`	`1193`	`if (meta.placementNew()) {`
`1194`	`1194`	`// For types that need placement new, we will call it, as well as`