Commit 6750e1e

smessmer authored and facebook-github-bot committed
C10_REGISTER_CAFFE2_OPERATOR: Macro for registering c2 kernels (pytorch#16548)
Summary:
Pull Request resolved: pytorch#16548

With this macro, a caffe2 operator can now directly be registered with c10. No need to write custom wrapper kernels anymore.

Differential Revision: D13877076
fbshipit-source-id: e56846238c5bb4b1989b79855fd44d5ecf089c9c
1 parent ac4f66c commit 6750e1e

9 files changed, +127 -102 lines
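For context, this is the registration call the new macro enables, as it appears in the layer_norm_op.cc hunk of this commit (comments added here for illustration):

// How the new macro is used (mirrors the layer_norm_op.cc hunk below).
// The generated schema is namespaced "_caffe2::", so the operator becomes
// reachable as torch.ops._caffe2.LayerNorm from Python and TorchScript.
C10_REGISTER_CAFFE2_OPERATOR(
    LayerNorm,                                        // registered as _caffe2::LayerNorm
    (std::vector<c10::Argument>{                      // operator inputs
      c10::Argument("input"),
      c10::Argument("axis", c10::IntType::get()),
      c10::Argument("epsilon", c10::FloatType::get())
    }),
    (std::vector<c10::Argument>{                      // operator outputs
      c10::Argument("output"),
      c10::Argument("mean"),
      c10::Argument("stdev")
    }),
    caffe2::LayerNormOp                               // the caffe2 operator template
)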

aten/src/ATen/core/dispatch/DispatchTable.h

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ class DispatchTable final {
  private:
   static size_t get_index_of_first_tensor_arg_(const FunctionSchema& schema) {
     for (size_t i = 0; i < schema.arguments().size(); ++i) {
-      if (schema.arguments()[i].type()->isSubtypeOf(TensorType::get())) { // DynamicType means it's a tensor
+      if (schema.arguments()[i].type()->isSubtypeOf(TensorType::get())) {
         return i;
       }
     }

aten/src/ATen/core/stack.h

Lines changed: 9 additions & 0 deletions
@@ -52,6 +52,15 @@ static inline IValue pop(Stack& stack) {
   stack.pop_back();
   return r;
 }
+static inline std::vector<IValue> pop(Stack& stack, size_t n) {
+  std::vector<IValue> result;
+  result.reserve(n);
+  for (size_t i = 0; i < n; ++i) {
+    result.push_back(std::move(peek(stack, i, n)));
+  }
+  drop(stack, n);
+  return result;
+}
 
 // variadic pop:
 // int64_t a; at::Tensor b;
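A small usage sketch of the new n-element pop overload (everything around the pop call is illustrative; only pop(stack, n) itself comes from this hunk). It removes the top n entries and returns them in their original push order, so element 0 of the result is the deepest of the n popped values:

// Illustrative only: push two values, then pop both at once.
torch::jit::Stack stack;
torch::jit::push(stack, int64_t(3), at::ones({2, 2}));  // int pushed first, tensor ends up on top
auto vals = torch::jit::pop(stack, 2);                   // returns std::vector<IValue>
// vals[0].toInt() == 3, vals[1].toTensor() is the 2x2 tensor; the stack is empty again.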

c10/util/flat_hash_map.h

Lines changed: 3 additions & 2 deletions
@@ -5,6 +5,7 @@
 // - https://github.com/skarupke/flat_hash_map/pull/26
 // - replace size_t with uint64_t to fix it for 32bit
 // - add "GCC diagnostic" pragma to ignore -Wshadow
+// - make sherwood_v3_table::convertible_to_iterator public because GCC5 seems to have issues with it otherwise
 
 // Copyright Malte Skarupke 2017.
 // Distributed under the Boost Software License, Version 1.0.
@@ -293,9 +294,9 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
     using Entry = detailv3::sherwood_v3_entry<T>;
     using AllocatorTraits = std::allocator_traits<EntryAlloc>;
     using EntryPointer = typename AllocatorTraits::pointer;
-    struct convertible_to_iterator;
 
 public:
+    struct convertible_to_iterator;
 
     using value_type = T;
     using size_type = uint64_t;
@@ -924,7 +925,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         return static_cast<Equal &>(*this)(lhs, rhs);
     }
 
-private:
+public:
     struct convertible_to_iterator
     {
         EntryPointer it;

caffe2/core/c10_operator.h

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <vector>
+#include <ATen/core/dispatch/OpSchemaRegistration.h>
+#include <ATen/core/dispatch/KernelRegistration.h>
+#include <ATen/core/function_schema.h>
+
+namespace caffe2 {
+namespace detail {
+
+template<class Caffe2Operator> const c10::OperatorHandle& c10_op_handle_for_c2_op();
+template <class Caffe2Operator>
+void call_caffe2_op_from_c10(c10::Stack* stack, c10::KernelCache* cache) { // TODO Pass in correct cache type
+  // precondition: on the stack, there's an IValue for each input and an IValue for each output.
+  // The output ones could either be a preallocated tensor or ivalue::None.
+
+  const auto& schema = c10_op_handle_for_c2_op<Caffe2Operator>().schema();
+  const size_t num_outputs = schema.returns().size();
+  const size_t total_num_arguments = schema.arguments().size();
+  const size_t num_inputs = total_num_arguments - num_outputs;
+
+  // TODO Avoid vector allocation. One idea would be to keep the std::vector instances in the cache.
+  auto outputs = torch::jit::pop(*stack, num_outputs);
+  auto inputs = torch::jit::pop(*stack, num_inputs);
+
+  const auto device = at::Device(at::DeviceType::CPU); // TODO Handle GPU devices
+
+  for (auto& output : outputs) {
+    if (output.isNone() || (output.isTensor() && !output.toTensor().defined())) {
+      output = at::Tensor(c10::C10Tensor(caffe2::empty({0}, device)));
+    }
+  }
+
+  std::vector<c10::IValue*> outputPtrs;
+  outputPtrs.reserve(outputs.size());
+  for (auto& output : outputs) {
+    outputPtrs.push_back(&output);
+  }
+
+  Caffe2Operator(schema, std::move(inputs), std::move(outputPtrs)).Run();
+
+  for (auto& output : outputs) {
+    torch::jit::push(*stack, std::move(output));
+  }
+
+  // postcondition: All inputs are cleared from the stack, there's now one
+  // IValue for each output which holds the result. This
+  // might reuse one of the preallocated tensors but doesn't have to.
+}
+
+inline c10::FunctionSchema make_function_schema_for_c10(const char* OperatorName, std::vector<c10::Argument> inputs, std::vector<c10::Argument> outputs) {
+  // actual_inputs is the real inputs plus an optional tensor argument for each output.
+  // this can be used to pass in a preallocated output tensor.
+  std::vector<c10::Argument> actual_inputs = std::move(inputs);
+  actual_inputs.reserve(actual_inputs.size() + outputs.size());
+  for (const auto& elem : outputs) {
+    AT_ASSERT(elem.type()->isSubtypeOf(c10::TensorType::get()));
+    actual_inputs.push_back(c10::Argument(elem.name(), c10::OptionalType::create(elem.type()), nullopt, IValue()));
+  }
+
+  return c10::FunctionSchema(
+    std::string("_caffe2::") + OperatorName,
+    std::move(actual_inputs), std::move(outputs));
+}
+
+}
+}
+
+/**
+ * Call this macro to register a caffe2 operator with the c10 dispatcher.
+ */
+// TODO This macro should take a JIT schema string instead of a vector of inputs and outputs.
+#define C10_REGISTER_CAFFE2_OPERATOR(OperatorName, Inputs, Outputs, OperatorClass) \
+  /* Register the op schema with the c10 dispatcher */ \
+  namespace caffe2 { \
+    C10_DEFINE_OP_SCHEMA(OperatorName, \
+      caffe2::detail::make_function_schema_for_c10( \
+        #OperatorName, Inputs, Outputs)); \
+  } \
+  /* Store the c10 operator handle so call_caffe2_op_from_c10 can access it */ \
+  namespace caffe2 { namespace detail { \
+    template<> \
+    const c10::OperatorHandle& c10_op_handle_for_c2_op<OperatorClass<caffe2::CPUContext>>() { \
+      return caffe2::OperatorName(); \
+    } \
+  }} \
+  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
+  namespace c10 { \
+    C10_REGISTER_KERNEL(caffe2::OperatorName) \
+      /*.withCache<Cache>()*/ \
+      .kernel<&caffe2::detail::call_caffe2_op_from_c10<OperatorClass<caffe2::CPUContext>>>() \
+      .dispatchKey(CPUTensorId()); \
+  }
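To make the calling convention concrete, here is a hedged caller-side sketch of the stack layout that call_caffe2_op_from_c10 expects. In practice the c10 dispatcher sets this up; the direct call and the helper function name below are purely illustrative and assume the LayerNorm registration from layer_norm_op.cc in this commit:

#include "caffe2/core/operator.h"            // now also pulls in c10_operator.h (see the operator.h hunk below)
#include "caffe2/operators/layer_norm_op.h"

void example_direct_call() {
  // precondition: one IValue per input, then one IValue per output.
  // None (a default-constructed IValue) means "allocate a fresh output tensor".
  c10::Stack stack;
  torch::jit::push(stack, at::rand({2, 3}), int64_t(1), 1e-5);   // input, axis, epsilon
  torch::jit::push(stack, c10::IValue(), c10::IValue(), c10::IValue());

  // The kernel pops inputs and outputs, runs the caffe2 op, and pushes the results back.
  // The cache argument is unused by this kernel (see the TODO above), so nullptr is passed.
  caffe2::detail::call_caffe2_op_from_c10<caffe2::LayerNormOp<caffe2::CPUContext>>(&stack, nullptr);

  // postcondition: exactly one IValue per output remains (output, mean, stdev).
  auto results = torch::jit::pop(stack, 3);
}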

caffe2/core/operator.h

Lines changed: 2 additions & 0 deletions
@@ -1356,4 +1356,6 @@ std::function<void(const OperatorDef&)> GetOperatorLogger();
 
 } // namespace caffe2
 
+#include "caffe2/core/c10_operator.h"
+
 #endif // CAFFE2_CORE_OPERATOR_H_

caffe2/operators/layer_norm_op.cc

Lines changed: 13 additions & 94 deletions
@@ -182,99 +182,18 @@ to the end.)
     .Output(1, "mean", "Mean values for each feature vector")
     .Output(2, "stddev", "Standard deviations for each feature vector");
 
-C10_DEFINE_OP_SCHEMA(LayerNorm, FunctionSchema(
-    "caffe2::layer_norm_dont_use_this_op_yet",
-    (std::vector<c10::Argument>{
-      c10::Argument("input"),
-      c10::Argument("axis", IntType::get()),
-      c10::Argument("epsilon", FloatType::get()),
-      c10::Argument("output", OptionalType::ofTensor(), nullopt, IValue()),
-      c10::Argument("output_mean", OptionalType::ofTensor(), nullopt, IValue()),
-      c10::Argument("output_stdev", OptionalType::ofTensor(), nullopt, IValue())
-    }), (std::vector<c10::Argument>{
-      c10::Argument("output"),
-      c10::Argument("mean"),
-      c10::Argument("stdev")
-    })
-));
-
 } // namespace caffe2
 
-
-// Register layer norm with c10
-namespace {
-struct Cache final : public c10::KernelCache {
-  at::optional<at::Tensor> scale = at::nullopt;
-  at::optional<at::Tensor> bias = at::nullopt;
-};
-
-template <class DataType>
-void layer_norm_c10(c10::Stack* stack, c10::KernelCache* cache_) { // TODO Pass in correct cache type
-  c10::ArrayRef<c10::IValue> inputs = torch::jit::peekSlice(*stack, 0, 3, 6);
-  c10::ArrayRef<c10::IValue> outputs = torch::jit::peekSlice(*stack, 3, 3, 6);
-
-  caffe2::Tensor X{inputs[0].toTensor()};
-  int64_t axis = inputs[1].toInt();
-  float epsilon = inputs[2].toDouble();
-
-  auto device = X.GetDevice();
-
-  caffe2::Tensor Y, mean, sig;
-  if (outputs[0].isTensor()) {
-    Y = caffe2::Tensor(std::move(torch::jit::peek(*stack, 0, 3)).toTensor());
-  }
-  if (outputs[1].isTensor()) {
-    mean = caffe2::Tensor(std::move(torch::jit::peek(*stack, 1, 3)).toTensor());
-  }
-  if (outputs[2].isTensor()) {
-    sig = caffe2::Tensor(std::move(torch::jit::peek(*stack, 2, 3)).toTensor());
-  }
-  if (!Y.defined()) {
-    Y = caffe2::empty({0}, device);
-  }
-  if (!mean.defined()) {
-    mean = caffe2::empty({0}, device);
-  }
-  if (!sig.defined()) {
-    sig = caffe2::empty({0}, device);
-  }
-
-  caffe2::CPUContext context;
-  Cache* cache = static_cast<Cache*>(cache_);
-  if (!cache->scale.has_value()) {
-    cache->scale = at::Tensor(caffe2::empty({0}, at::dtype<float>()));
-  }
-  if (!cache->bias.has_value()) {
-    cache->bias = at::Tensor(caffe2::empty({0}, at::dtype<float>()));
-  }
-  caffe2::Tensor scale(*cache->scale);
-  caffe2::Tensor bias(*cache->bias);
-
-  const int canonical_axis = X.canonical_axis_index(axis);
-  std::vector<int64_t> moments_dims(
-      X.sizes().cbegin(), X.sizes().cbegin() + canonical_axis);
-  moments_dims.push_back(1);
-  mean.Resize(moments_dims);
-  sig.Resize(moments_dims);
-  caffe2::LayerNormOp<caffe2::CPUContext>::runLayerNorm<DataType>(
-    X, &Y, &mean, &sig, canonical_axis, epsilon, &scale, &bias, static_cast<caffe2::CPUContext*>(&context)
-  );
-
-  torch::jit::drop(*stack, 6);
-  torch::jit::push(*stack,
-    at::Tensor(std::move(Y)),
-    at::Tensor(std::move(mean)),
-    at::Tensor(std::move(sig))
-  );
-
-  return;
-}
-
-}
-namespace c10 {
-C10_REGISTER_KERNEL(caffe2::LayerNorm)
-  .withCache<Cache>()
-  .kernel<&layer_norm_c10<float>>()
-  .dispatchKey(CPUTensorId());
-} // namespace c10
+C10_REGISTER_CAFFE2_OPERATOR(
+    LayerNorm,
+    (std::vector<c10::Argument>{
+      c10::Argument("input"),
+      c10::Argument("axis", c10::IntType::get()),
+      c10::Argument("epsilon", c10::FloatType::get())
+    }), (std::vector<c10::Argument>{
+      c10::Argument("output"),
+      c10::Argument("mean"),
+      c10::Argument("stdev")
+    }),
+    caffe2::LayerNormOp
+)

caffe2/operators/layer_norm_op.h

Lines changed: 3 additions & 2 deletions
@@ -19,8 +19,9 @@ class LayerNormOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
 
-  LayerNormOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<Context>(operator_def, ws),
+  template<class... Args>
+  explicit LayerNormOp(Args&&... args)
+      : Operator<Context>(std::forward<Args>(args)...),
         OP_SINGLE_ARG(int, "axis", axis_, 1),
         OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5f) {}
 
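This forwarding constructor is what lets the same operator class serve both construction paths; a brief sketch of the two call sites (the c10-side arguments are the ones passed by call_caffe2_op_from_c10 in c10_operator.h above):

// Classic caffe2 construction, unchanged in behavior:
//   caffe2::LayerNormOp<caffe2::CPUContext> op(operator_def, workspace);
//
// New c10 construction path, as used inside call_caffe2_op_from_c10:
//   Caffe2Operator(schema, std::move(inputs), std::move(outputPtrs)).Run();
//
// Both forward through the single
//   template<class... Args> explicit LayerNormOp(Args&&... args)
// constructor into the matching Operator<Context> base-class overload.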

caffe2/python/operator_test/layer_norm_op_test.py

Lines changed: 2 additions & 2 deletions
@@ -143,7 +143,7 @@ def test_layer_norm_op_pytorch(self, X, gc, dc):
         epsilon = 1e-4
 
         expected_norm, expected_mean, expected_stdev = _layer_norm_ref(axis, epsilon, X)
-        actual_norm, actual_mean, actual_stdev = torch.ops.caffe2.layer_norm_dont_use_this_op_yet(torch.tensor(X), axis, epsilon)
+        actual_norm, actual_mean, actual_stdev = torch.ops._caffe2.LayerNorm(torch.tensor(X), axis, epsilon)
 
         torch.testing.assert_allclose(expected_norm, actual_norm)
         torch.testing.assert_allclose(expected_mean, actual_mean)
@@ -154,7 +154,7 @@ def test_layer_norm_op_jit(self, X, gc, dc):
         @torch.jit.script
         def jit_layer_norm(tensor, axis, epsilon):
             # type: (Tensor, int, float) -> Tuple[Tensor, Tensor, Tensor]
-            norm, mean, stdev = torch.ops.caffe2.layer_norm_dont_use_this_op_yet(tensor, axis, epsilon)
+            norm, mean, stdev = torch.ops._caffe2.LayerNorm(tensor, axis, epsilon)
             return norm, mean, stdev
 
         axis = np.random.randint(0, len(X.shape))

test/test_torch.py

Lines changed: 1 addition & 1 deletion
@@ -10072,7 +10072,7 @@ def test_c10_layer_norm(self):
 
         expected_norm = torch.nn.functional.layer_norm(X, X.size()[1:], eps=epsilon)
         actual_norm, actual_mean, actual_stdev = \
-            torch.ops.caffe2.layer_norm_dont_use_this_op_yet(torch.tensor(X), 1, epsilon)
+            torch.ops._caffe2.LayerNorm(torch.tensor(X), 1, epsilon)
         torch.testing.assert_allclose(expected_norm, actual_norm)
 
         # Functions to test negative dimension wrapping
