Commit 96d826f

dreiss authored and facebook-github-bot committed
Define REGISTER_CPU_GRADIENT_OPERATOR (pytorch#12588)

Summary:
Pull Request resolved: pytorch#12588

By default, this is an alias for REGISTER_CPU_OPERATOR. If gradients are not
required (e.g., on mobile), it can be converted to a no-op by defining
CAFFE2_NO_GRADIENT_OPS, resulting in a smaller build. GRADIENT_OPERATOR_SCHEMA
works similarly. CAFFE2_NO_GRADIENT_OPS also converts REGISTER_GRADIENT to a
no-op.

Use these macros in fully_connected_op.cc as an example. Follow-up diffs will
convert more operators.

I had to introduce MACRO_EXPAND to handle the way Visual Studio expands
__VA_ARGS__.

Reviewed By: Yangqing

Differential Revision: D10209468

fbshipit-source-id: 4116d9098b97646bb30a00f2a7d46aa5d7ebcae0
1 parent da73d70 · commit 96d826f

File tree: 5 files changed, +39 −4 lines changed
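Taken together, the five diffs below enable the following pattern. This is a
hedged sketch, not code from this commit: MyOp and its classes are hypothetical
names, and the flag would normally be set by the build system rather than in
source.

// In some_op.cc inside the Caffe2 tree.
// The forward operator is always registered:
REGISTER_CPU_OPERATOR(MyOp, MyOpImpl<CPUContext>);
OPERATOR_SCHEMA(MyOp).NumInputs(1).NumOutputs(1);

// The gradient pieces use the new macros. Built normally they behave exactly
// like REGISTER_CPU_OPERATOR / OPERATOR_SCHEMA / REGISTER_GRADIENT; built with
// -DCAFFE2_NO_GRADIENT_OPS the registrations compile to nothing and the
// schema collapses to a never-registered stub, shrinking the binary:
REGISTER_CPU_GRADIENT_OPERATOR(MyOpGradient, MyOpGradientImpl<CPUContext>);
GRADIENT_OPERATOR_SCHEMA(MyOpGradient).NumInputs(2).NumOutputs(1);
REGISTER_GRADIENT(MyOp, GetMyOpGradient);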

c10/macros/Macros.h (+2)

@@ -32,6 +32,8 @@
 #define CONCAT_IMPL(x, y) x##y
 #define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)

+#define MACRO_EXPAND(args) args
+
 /// C10_NODISCARD - Warn if a type or return value is discarded.
 #define C10_NODISCARD
 #if __cplusplus > 201402L && defined(__has_cpp_attribute)
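The reason MACRO_EXPAND exists is an MSVC preprocessor quirk: when __VA_ARGS__
is forwarded into another macro, MSVC's traditional preprocessor can hand it
over as a single token instead of re-scanning it into separate arguments. A
minimal, self-contained sketch of the failure mode and the fix (COUNT and
COUNT_IMPL are illustrative, not from this commit):

#define MACRO_EXPAND(args) args

// COUNT_IMPL picks its third argument; COUNT pads the list so that the
// picked slot reflects how many arguments were actually passed.
#define COUNT_IMPL(a, b, n, ...) n
#define COUNT(...) MACRO_EXPAND(COUNT_IMPL(__VA_ARGS__, 2, 1, 0))

// Without the MACRO_EXPAND wrapper, MSVC would pass "x, y" into COUNT_IMPL
// as a single first argument, and this would evaluate to 1 instead of 2.
static_assert(COUNT(x, y) == 2, "MSVC __VA_ARGS__ re-scan works");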

caffe2/core/operator.h (+10)

@@ -9,6 +9,7 @@
 #include <typeinfo>
 #include <vector>

+#include "c10/macros/Macros.h"
 #include "c10/util/Registry.h"
 #include "caffe2/core/blob.h"
 #include "caffe2/core/common.h"
@@ -858,6 +859,15 @@ C10_DECLARE_REGISTRY(
 #define REGISTER_CPU_OPERATOR_WITH_ENGINE(name, engine, ...) \
   C10_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)

+// Use these macros to register gradient operators. They can be automatically
+// excluded from builds that don't need them (e.g., mobile).
+#ifdef CAFFE2_NO_GRADIENT_OPS
+#define REGISTER_CPU_GRADIENT_OPERATOR(...) /* No gradients. */
+#else
+#define REGISTER_CPU_GRADIENT_OPERATOR(...) \
+  MACRO_EXPAND(REGISTER_CPU_OPERATOR(__VA_ARGS__))
+#endif
+
 C10_DECLARE_REGISTRY(
     CUDAOperatorRegistry,
     OperatorBase,

caffe2/core/operator_gradient.h (+9)

@@ -301,11 +301,20 @@ C10_DECLARE_REGISTRY(
     const OperatorDef&,
     const vector<GradientWrapper>&);

+#ifdef CAFFE2_NO_GRADIENT_OPS
+
+#define REGISTER_GRADIENT(name, ...) /* No gradients. */
+#define REGISTER_GRADIENT_STR(str_name, ...) /* No gradients. */
+
+#else
+
 #define REGISTER_GRADIENT(name, ...) \
   C10_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__)
 #define REGISTER_GRADIENT_STR(str_name, ...) \
   C10_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__)

+#endif
+
 // NO_GRADIENT means that the operator does not need any gradient computation.
 #define NO_GRADIENT(name) REGISTER_GRADIENT(name, NoGradient)
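One consequence visible in this hunk: NO_GRADIENT expands through
REGISTER_GRADIENT, so it is silenced by the same switch with no special-casing.
A hedged sketch (MyInferenceOnlyOp is a hypothetical operator name):

NO_GRADIENT(MyInferenceOnlyOp);
// normal build:           REGISTER_GRADIENT(MyInferenceOnlyOp, NoGradient)
// CAFFE2_NO_GRADIENT_OPS: expands to the comment /* No gradients. */ only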

caffe2/core/operator_schema.h (+12)

@@ -597,4 +597,16 @@ OpSchema::Cost PointwiseCostInference(

 #endif // CAFFE2_NO_OPERATOR_SCHEMA

+#ifdef CAFFE2_NO_GRADIENT_OPS
+
+#define GRADIENT_OPERATOR_SCHEMA(name)                                \
+  C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){};   \
+  static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED =       \
+      1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
+
+#else
+
+#define GRADIENT_OPERATOR_SCHEMA(name) OPERATOR_SCHEMA(name)
+
+#endif
 #endif // CAFFE2_CORE_OPERATOR_SCHEMA_H_
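The stub branch mirrors what OPERATOR_SCHEMA already emits under
CAFFE2_NO_OPERATOR_SCHEMA (the #endif in the context above). Because the
chained builder calls attach to the macro's trailing expression, an
illustrative expansion of GRADIENT_OPERATOR_SCHEMA(FCGradient).NumInputs(3)
in a gradient-free build looks roughly like this (anon_var stands in for
C10_ANONYMOUS_VARIABLE):

C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_FCGradient(){};
static OpSchema* anon_var CAFFE2_UNUSED =
    1 ? nullptr
      : &OpSchemaRegistry::NewSchema("FCGradient", __FILE__, __LINE__)
             .NumInputs(3);
// The builder chain still type-checks against OpSchema, but the false arm of
// the always-true conditional is never evaluated, so no schema is created or
// registered at runtime.

Callers therefore keep the same fluent syntax whether or not gradients are
compiled in.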

caffe2/operators/fully_connected_op.cc (+6 −4)

@@ -5,15 +5,17 @@
 namespace caffe2 {

 REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<CPUContext>);
-REGISTER_CPU_OPERATOR(FCGradient, FullyConnectedGradientOp<CPUContext>);
+REGISTER_CPU_GRADIENT_OPERATOR(
+    FCGradient,
+    FullyConnectedGradientOp<CPUContext>);

 REGISTER_CPU_OPERATOR(
     FCTransposed,
     FullyConnectedOp<
         CPUContext,
         DefaultEngine,
         false /* don't transpose weight */>);
-REGISTER_CPU_OPERATOR(
+REGISTER_CPU_GRADIENT_OPERATOR(
     FCTransposedGradient,
     FullyConnectedGradientOp<
         CPUContext,
@@ -255,13 +257,13 @@ print("Y:\n", workspace.FetchBlob("Y"))
         "Ouput blob containing a 2D output matrix of shape $(M,N)$, where $M$ is the batch size and $N$ is the number of nodes in the layer. The ouput is calculated as $Y=XW^T+b$.")
     .InheritOnnxSchema("Gemm");

-OPERATOR_SCHEMA(FCGradient)
+GRADIENT_OPERATOR_SCHEMA(FCGradient)
     .NumInputs(3)
     .NumOutputs(2, 3)
     .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false))
     .CostInferenceFunction(
         std::bind(CostInferenceForFCGradient, _1, _2, false));
-OPERATOR_SCHEMA(FCTransposedGradient)
+GRADIENT_OPERATOR_SCHEMA(FCTransposedGradient)
     .NumInputs(3)
     .NumOutputs(2, 3)
     .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false))
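To observe the net effect on this file, one could check the CPU registry at
runtime. This is a hedged sketch, not part of the commit: it assumes the Has()
accessor on the c10 registry and that fully_connected_op.cc is linked into the
binary so its static registrations run.

#include "caffe2/core/operator.h"
#include <cassert>

int main() {
  // The forward ops are registered unconditionally.
  assert(caffe2::CPUOperatorRegistry()->Has("FC"));
#ifdef CAFFE2_NO_GRADIENT_OPS
  // In a gradient-free build, the gradient ops never enter the registry.
  assert(!caffe2::CPUOperatorRegistry()->Has("FCGradient"));
#else
  assert(caffe2::CPUOperatorRegistry()->Has("FCGradient"));
#endif
  return 0;
}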
