Skip to content

Commit 06d1be2

Browse files
gandalf2390 and facebook-github-bot
authored and committed
[NOOP][clangformat][codemod] Enable CLANGFORMAT for caffe2/caffe2/* (pytorch#67624)
Summary: Pull Request resolved: pytorch#67624 Test Plan: Visual inspection. Sandcastle. Reviewed By: malfet Differential Revision: D31986628 fbshipit-source-id: c872bded7325997a2945dbf5d4d052628dcb3659
1 parent e86a5a3 commit 06d1be2

File tree

83 files changed

+1231
-947
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+1231
-947
lines changed

caffe2/cuda_rtc/common_rtc.h

+36-25
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@
77
#include <cuda.h>
88
#include <nvrtc.h>
99

10-
#define NVRTC_CHECK(condition) \
11-
do { \
12-
nvrtcResult result = condition; \
13-
if (result != NVRTC_SUCCESS) { \
14-
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
15-
<< nvrtcGetErrorString(result); \
16-
} \
17-
} while(0)
10+
#define NVRTC_CHECK(condition) \
11+
do { \
12+
nvrtcResult result = condition; \
13+
if (result != NVRTC_SUCCESS) { \
14+
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
15+
<< nvrtcGetErrorString(result); \
16+
} \
17+
} while (0)
1818

1919
namespace caffe2 {
2020

@@ -39,15 +39,14 @@ class CudaRTCFunction {
3939
VLOG(1) << "function src:\n" << src;
4040
// Actually do the compiling.
4141
nvrtcProgram prog;
42-
NVRTC_CHECK(nvrtcCreateProgram(
43-
&prog, src.c_str(), nullptr, 0, nullptr, nullptr));
42+
NVRTC_CHECK(
43+
nvrtcCreateProgram(&prog, src.c_str(), nullptr, 0, nullptr, nullptr));
4444
// Compile the program.
4545
// TODO(Yangqing): how to find the current gpu architecture instead of hard
4646
// coding it?
47-
const char *nvrtc_opts[] = {"--gpu-architecture=compute_35",
48-
"--use_fast_math"};
49-
nvrtcResult compile_result = nvrtcCompileProgram(
50-
prog, 2, nvrtc_opts);
47+
const char* nvrtc_opts[] = {
48+
"--gpu-architecture=compute_35", "--use_fast_math"};
49+
nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
5150
if (compile_result != NVRTC_SUCCESS) {
5251
size_t log_size;
5352
NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
@@ -74,21 +73,33 @@ class CudaRTCFunction {
7473
}
7574

7675
template <typename... Args>
77-
void Launch(unsigned int gx, unsigned int gy, unsigned int gz,
78-
unsigned int bx, unsigned int by, unsigned int bz,
79-
unsigned int shared_mem, cudaStream_t stream,
80-
Args... args) {
76+
void Launch(
77+
unsigned int gx,
78+
unsigned int gy,
79+
unsigned int gz,
80+
unsigned int bx,
81+
unsigned int by,
82+
unsigned int bz,
83+
unsigned int shared_mem,
84+
cudaStream_t stream,
85+
Args... args) {
8186
CAFFE_ENFORCE(
8287
module_loaded_, "Cannot call Launch before a module is loaded.");
83-
void * args_voidp[] = {&args...};
88+
void* args_voidp[] = {&args...};
8489
CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
8590
kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0));
8691
}
8792

88-
void LaunchEx(unsigned int gx, unsigned int gy, unsigned int gz,
89-
unsigned int bx, unsigned int by, unsigned int bz,
90-
unsigned int shared_mem, cudaStream_t stream,
91-
void** extra) {
93+
void LaunchEx(
94+
unsigned int gx,
95+
unsigned int gy,
96+
unsigned int gz,
97+
unsigned int bx,
98+
unsigned int by,
99+
unsigned int bz,
100+
unsigned int shared_mem,
101+
cudaStream_t stream,
102+
void** extra) {
92103
CAFFE_ENFORCE(
93104
module_loaded_, "Cannot call Launch before a module is loaded.");
94105
CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
@@ -115,6 +126,6 @@ inline std::string GetUniqueName() {
115126
return ss.str();
116127
}
117128

118-
} // namepsace caffe2
129+
} // namespace caffe2
119130

120-
#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_
131+
#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_

caffe2/cuda_rtc/elemenntwise_rtc_gpu.cc

+20-21
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55

66
namespace caffe2 {
77
namespace {
8-
class ElementwiseRTCFunction
9-
: public CudaRTCFunction<ElementwiseRTCFunction> {
8+
class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> {
109
public:
1110
ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
1211

@@ -22,22 +21,21 @@ class ElementwiseRTCFunction
2221
string name_;
2322
};
2423

25-
template<>
24+
template <>
2625
string ElementwiseRTCFunction::GetSource(
27-
int input_size, int output_size,
26+
int input_size,
27+
int output_size,
2828
const string command_string) {
2929
std::stringstream ss;
30-
ss << "extern \"C\" __global__ void " << name_ <<
31-
"(const size_t nthreads, \n";
30+
ss << "extern \"C\" __global__ void " << name_
31+
<< "(const size_t nthreads, \n";
3232
// Insert the parameter list.
3333
int remain_params = input_size + output_size;
3434
for (int i = 0; i < input_size; ++i) {
35-
ss << "const float* in" << i
36-
<< ((remain_params--) ? ", \n" : "");
35+
ss << "const float* in" << i << ((remain_params--) ? ", \n" : "");
3736
}
3837
for (int i = 0; i < output_size; ++i) {
39-
ss << "float* out" << i
40-
<< ((remain_params--) ? ", \n" : "");
38+
ss << "float* out" << i << ((remain_params--) ? ", \n" : "");
4139
}
4240
ss << ") {\n"
4341
"for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
@@ -46,7 +44,7 @@ string ElementwiseRTCFunction::GetSource(
4644
<< "}\n}";
4745
return ss.str();
4846
}
49-
} // namespace
47+
} // namespace
5048

5149
/**
5250
* A GPU operator that can generate limited elementwise operations.
@@ -75,17 +73,17 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
7573
public:
7674
ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
7775
: Operator<CUDAContext>(operator_def, ws) {
78-
const string src = OperatorBase::GetSingleArgument<string>(
79-
"rtc_src", "");
76+
const string src = OperatorBase::GetSingleArgument<string>("rtc_src", "");
8077
CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
8178
func_.Compile(InputSize(), OutputSize(), src);
8279
}
8380
~ElementwiseRTCOp() override {}
8481

8582
bool RunOnDevice() override {
86-
static_assert(sizeof(void*) == sizeof(size_t),
87-
"The argbuffer relies on the assumption that void* and "
88-
"size_t have the same size.");
83+
static_assert(
84+
sizeof(void*) == sizeof(size_t),
85+
"The argbuffer relies on the assumption that void* and "
86+
"size_t have the same size.");
8987
vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
9088
size_t* argBuffer = argBuffer_vec.data();
9189
CAFFE_ENFORCE(
@@ -102,10 +100,11 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
102100
}
103101
size_t argBufferSize = sizeof(argBuffer);
104102
void* config[] = {
105-
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
106-
CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
107-
CU_LAUNCH_PARAM_END
108-
};
103+
CU_LAUNCH_PARAM_BUFFER_POINTER,
104+
argBuffer,
105+
CU_LAUNCH_PARAM_BUFFER_SIZE,
106+
&argBufferSize,
107+
CU_LAUNCH_PARAM_END};
109108
func_.LaunchEx(
110109
CAFFE_GET_BLOCKS(Input(0).numel()),
111110
1,
@@ -127,4 +126,4 @@ namespace {
127126
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
128127
}
129128

130-
} // namespace caffe2
129+
} // namespace caffe2

caffe2/cuda_rtc/pool_op_rtc_gpu.cc

+44-17
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22

33
#include "caffe2/core/common_gpu.h"
44
#include "caffe2/core/context_gpu.h"
5-
#include "caffe2/operators/pool_op.h"
65
#include "caffe2/cuda_rtc/common_rtc.h"
6+
#include "caffe2/operators/pool_op.h"
77

88
namespace caffe2 {
99
namespace {
1010
class AveragePool {};
1111
class MaxPool {};
12-
} // namespace
12+
} // namespace
1313

1414
namespace {
1515

@@ -98,7 +98,6 @@ __global__ void %s(
9898
}
9999
)";
100100

101-
102101
class MaxPoolRTCFunction : public CudaRTCFunction<MaxPoolRTCFunction> {
103102
public:
104103
MaxPoolRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
@@ -132,7 +131,6 @@ class MaxPoolGradientRTCFunction
132131
string name_;
133132
};
134133

135-
136134
template <>
137135
string MaxPoolRTCFunction::GetSource(
138136
const int output_size,
@@ -149,9 +147,22 @@ string MaxPoolRTCFunction::GetSource(
149147
const int pad_l) {
150148
char buffer[65536];
151149
int nbytes = snprintf(
152-
buffer, 65536, kMaxPoolForwardNCHWSource, name_.c_str(), output_size,
153-
channels, height, width, pooled_height, pooled_width, kernel_h, kernel_w,
154-
stride_h, stride_w, pad_t, pad_l);
150+
buffer,
151+
65536,
152+
kMaxPoolForwardNCHWSource,
153+
name_.c_str(),
154+
output_size,
155+
channels,
156+
height,
157+
width,
158+
pooled_height,
159+
pooled_width,
160+
kernel_h,
161+
kernel_w,
162+
stride_h,
163+
stride_w,
164+
pad_t,
165+
pad_l);
155166
DCHECK_GE(nbytes, 0);
156167
DCHECK_LT(nbytes, 65536);
157168
return string(buffer);
@@ -174,16 +185,29 @@ string MaxPoolGradientRTCFunction::GetSource(
174185
const int pad_l) {
175186
char buffer[65536];
176187
int nbytes = snprintf(
177-
buffer, 65536, kMaxPoolBackwardNCHWSource, name_.c_str(), output_size,
178-
num, channels, height, width, pooled_height, pooled_width, kernel_h,
179-
kernel_w, stride_h, stride_w, pad_t, pad_l);
188+
buffer,
189+
65536,
190+
kMaxPoolBackwardNCHWSource,
191+
name_.c_str(),
192+
output_size,
193+
num,
194+
channels,
195+
height,
196+
width,
197+
pooled_height,
198+
pooled_width,
199+
kernel_h,
200+
kernel_w,
201+
stride_h,
202+
stride_w,
203+
pad_t,
204+
pad_l);
180205
DCHECK_GE(nbytes, 0);
181206
DCHECK_LT(nbytes, 65536);
182207
return string(buffer);
183208
}
184209

185-
} // namespace
186-
210+
} // namespace
187211

188212
class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
189213
public:
@@ -196,7 +220,8 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
196220

197221
bool RunOnDeviceWithOrderNCHW() override {
198222
auto& X = Input(0);
199-
auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
223+
auto output_sizes =
224+
ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
200225
auto* Y = Output(0, output_sizes, at::dtype<float>());
201226

202227
if (input_dims_ != X.sizes()) {
@@ -307,7 +332,9 @@ class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
307332

308333
namespace {
309334
REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPool, NVRTC, MaxPoolRTCOp);
310-
REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPoolGradient, NVRTC,
311-
MaxPoolGradientRTCOp);
312-
} // namespace
313-
} // namespace caffe2
335+
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
336+
MaxPoolGradient,
337+
NVRTC,
338+
MaxPoolGradientRTCOp);
339+
} // namespace
340+
} // namespace caffe2

caffe2/db/create_db_op.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ REGISTER_CPU_OPERATOR(CreateDB, CreateDBOp<CPUContext>);
66
OPERATOR_SCHEMA(CreateDB).NumInputs(0).NumOutputs(1);
77

88
NO_GRADIENT(CreateDB);
9-
} // namespace caffe2
9+
} // namespace caffe2

caffe2/db/leveldb.cc

+32-14
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "caffe2/core/db.h"
2-
#include "caffe2/core/logging.h"
32
#include "caffe2/core/flags.h"
3+
#include "caffe2/core/logging.h"
44
#include "leveldb/db.h"
55
#include "leveldb/write_batch.h"
66

@@ -19,13 +19,27 @@ class LevelDBCursor : public Cursor {
1919
SeekToFirst();
2020
}
2121
~LevelDBCursor() override {}
22-
void Seek(const string& key) override { iter_->Seek(key); }
23-
bool SupportsSeek() override { return true; }
24-
void SeekToFirst() override { iter_->SeekToFirst(); }
25-
void Next() override { iter_->Next(); }
26-
string key() override { return iter_->key().ToString(); }
27-
string value() override { return iter_->value().ToString(); }
28-
bool Valid() override { return iter_->Valid(); }
22+
void Seek(const string& key) override {
23+
iter_->Seek(key);
24+
}
25+
bool SupportsSeek() override {
26+
return true;
27+
}
28+
void SeekToFirst() override {
29+
iter_->SeekToFirst();
30+
}
31+
void Next() override {
32+
iter_->Next();
33+
}
34+
string key() override {
35+
return iter_->key().ToString();
36+
}
37+
string value() override {
38+
return iter_->value().ToString();
39+
}
40+
bool Valid() override {
41+
return iter_->Valid();
42+
}
2943

3044
private:
3145
std::unique_ptr<leveldb::Iterator> iter_;
@@ -47,8 +61,7 @@ class LevelDBTransaction : public Transaction {
4761
leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
4862
batch_.reset(new leveldb::WriteBatch());
4963
CAFFE_ENFORCE(
50-
status.ok(),
51-
"Failed to write batch to leveldb. ", status.ToString());
64+
status.ok(), "Failed to write batch to leveldb. ", status.ToString());
5265
}
5366

5467
private:
@@ -71,12 +84,17 @@ class LevelDB : public DB {
7184
leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
7285
CAFFE_ENFORCE(
7386
status.ok(),
74-
"Failed to open leveldb ", source, ". ", status.ToString());
87+
"Failed to open leveldb ",
88+
source,
89+
". ",
90+
status.ToString());
7591
db_.reset(db_temp);
7692
VLOG(1) << "Opened leveldb " << source;
7793
}
7894

79-
void Close() override { db_.reset(); }
95+
void Close() override {
96+
db_.reset();
97+
}
8098
unique_ptr<Cursor> NewCursor() override {
8199
return make_unique<LevelDBCursor>(db_.get());
82100
}
@@ -92,5 +110,5 @@ REGISTER_CAFFE2_DB(LevelDB, LevelDB);
92110
// For lazy-minded, one can also call with lower-case name.
93111
REGISTER_CAFFE2_DB(leveldb, LevelDB);
94112

95-
} // namespace db
96-
} // namespace caffe2
113+
} // namespace db
114+
} // namespace caffe2

0 commit comments

Comments
 (0)