
Commit 16468a9

smessmer authored and facebook-github-bot committed
Automatically register c10 ops with JIT (pytorch#16534)
Summary:
Pull Request resolved: pytorch#16534

All c10 ops from the c10 dispatcher are now automatically registered with JIT.

Reviewed By: dzhulgakov

Differential Revision: D13869275

fbshipit-source-id: 5ab5dec5b983fe661f977f9d29d8036768cdcab6
1 parent e5e0bf4 commit 16468a9

File tree

10 files changed: +238 −103 lines
aten/src/ATen/core/dispatch/Dispatcher.cpp

Lines changed: 81 additions & 0 deletions
@@ -1,8 +1,89 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 
 namespace c10 {
+
+namespace detail {
+class RegistrationListenerList final {
+public:
+  void addListener(std::unique_ptr<OpRegistrationListener> listener) {
+    listeners_.push_back(std::move(listener));
+  }
+
+  void callOnOperatorRegistered(const OperatorHandle& op) {
+    for (auto& listener : listeners_) {
+      listener->onOperatorRegistered(op);
+    }
+  }
+
+  void callOnOperatorDeregistered(const OperatorHandle& op) {
+    for (auto& listener : listeners_) {
+      listener->onOperatorDeregistered(op);
+    }
+  }
+private:
+  std::vector<std::unique_ptr<OpRegistrationListener>> listeners_;
+};
+}
+
+OpRegistrationListener::~OpRegistrationListener() {}
+
+Dispatcher::Dispatcher()
+: operators_()
+, listeners_(guts::make_unique<detail::RegistrationListenerList>())
+, mutex_() {}
+
+Dispatcher::~Dispatcher() {}
+
 C10_EXPORT Dispatcher& Dispatcher::singleton() {
   static Dispatcher _singleton;
   return _singleton;
 }
+
+OperatorHandle Dispatcher::registerSchema(FunctionSchema schema) {
+  // we need a lock to avoid concurrent writes
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  operators_.emplace_back(std::move(schema));
+  auto op = OperatorHandle(--operators_.end());
+
+  // note: call listeners *after* operator is added, i.e. dispatcher is already valid for new op
+  listeners_->callOnOperatorRegistered(op);
+
+  return op;
+}
+
+void Dispatcher::deregisterSchema(const OperatorHandle& op) {
+  // we need a lock to avoid concurrent writes
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  if (!op.operatorDefIterator_->dispatchTable.isEmpty()) {
+    AT_ERROR("Tried to deregister op schema that still has kernels registered");
+  }
+
+  // note: call listeners *before* operator is removed, i.e. dispatcher is still valid for removed op
+  listeners_->callOnOperatorDeregistered(op);
+
+  operators_.erase(op.operatorDefIterator_);
+}
+
+void Dispatcher::registerKernel(const OperatorHandle& op, TensorTypeId dispatch_key, KernelFunction* kernel_func, KernelCacheCreatorFunction* cache_creator_func) {
+  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
+  op.operatorDefIterator_->dispatchTable.registerKernel(std::move(dispatch_key), DispatchTableEntry{kernel_func, cache_creator_func});
+}
+
+void Dispatcher::deregisterKernel(const OperatorHandle& op, TensorTypeId dispatch_key) {
+  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
+  op.operatorDefIterator_->dispatchTable.deregisterKernel(dispatch_key);
+}
+
+void Dispatcher::addRegistrationListener(std::unique_ptr<OpRegistrationListener> listener) {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
+    listener->onOperatorRegistered(OperatorHandle(iter));
+  }
+
+  listeners_->addListener(std::move(listener));
+}
+
 }
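As a quick illustration of the new listener API (a minimal sketch that assumes only the Dispatcher interface added above; this code is not part of the commit): addRegistrationListener() first replays onOperatorRegistered for every operator that is already registered, and then keeps notifying the listener on later registrations and deregistrations.

// Hedged sketch, not from this commit: a listener that just counts live operators.
#include <ATen/core/dispatch/Dispatcher.h>
#include <iostream>
#include <memory>

namespace {
class CountingListener final : public c10::OpRegistrationListener {
 public:
  void onOperatorRegistered(const c10::OperatorHandle& /*op*/) override {
    std::cout << "op registered, live ops: " << ++count_ << "\n";
  }
  void onOperatorDeregistered(const c10::OperatorHandle& /*op*/) override {
    std::cout << "op deregistered, live ops: " << --count_ << "\n";
  }
 private:
  int count_ = 0;
};
} // namespace

void installCountingListener() {
  // Already-registered ops are replayed through onOperatorRegistered first,
  // so the count starts at the number of currently registered operators.
  c10::Dispatcher::singleton().addRegistrationListener(
      std::make_unique<CountingListener>());
}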

aten/src/ATen/core/dispatch/Dispatcher.h

Lines changed: 30 additions & 29 deletions
@@ -51,6 +51,23 @@ class CAFFE2_API OpKernel final {
   mutable std::unique_ptr<c10::KernelCache> cache_;
 };
 
+/**
+ * Implement this interface and register your instance with the dispatcher
+ * to get notified when operators are registered or deregistered with
+ * the dispatcher.
+ */
+class CAFFE2_API OpRegistrationListener {
+public:
+  virtual ~OpRegistrationListener();
+
+  virtual void onOperatorRegistered(const OperatorHandle& op) = 0;
+  virtual void onOperatorDeregistered(const OperatorHandle& op) = 0;
+};
+
+namespace detail {
+class RegistrationListenerList;
+}
+
 /**
  * Top-level dispatch interface for dispatching via the dynamic dispatcher.
  */
@@ -67,6 +84,8 @@ class CAFFE2_API Dispatcher final {
   friend class OperatorHandle;
 
 public:
+  ~Dispatcher();
+
   // Implementation note: this class abstracts over the fact that we have per-operator
   // dispatch tables. This could be easily adjusted to have a single global hash
   // table.
@@ -100,8 +119,19 @@ class CAFFE2_API Dispatcher final {
   */
   OpKernel lookup(const OperatorHandle& op, const Stack* stack) const;
 
+  /**
+   * Add a listener that gets called whenever a new op is registered or an existing
+   * op is deregistered. Immediately after registering, this listener gets called
+   * for all previously registered ops, so it can be used to keep track of ops
+   * registered with this dispatcher.
+   */
+  void addRegistrationListener(std::unique_ptr<OpRegistrationListener> listener);
+
 private:
+  Dispatcher();
+
   std::list<OperatorDef> operators_;
+  std::unique_ptr<detail::RegistrationListenerList> listeners_;
   std::mutex mutex_;
 };
 
@@ -130,35 +160,6 @@ class CAFFE2_API OperatorHandle final {
 };
 
 
-
-inline OperatorHandle Dispatcher::registerSchema(FunctionSchema schema) {
-  // we need a lock to avoid concurrent writes
-  std::lock_guard<std::mutex> lock(mutex_);
-
-  operators_.emplace_back(std::move(schema));
-  return OperatorHandle(--operators_.end());
-}
-
-inline void Dispatcher::deregisterSchema(const OperatorHandle& op) {
-  // we need a lock to avoid concurrent writes
-  std::lock_guard<std::mutex> lock(mutex_);
-
-  if (!op.operatorDefIterator_->dispatchTable.isEmpty()) {
-    AT_ERROR("Tried to deregister op schema that still has kernels registered");
-  }
-  operators_.erase(op.operatorDefIterator_);
-}
-
-inline void Dispatcher::registerKernel(const OperatorHandle& op, TensorTypeId dispatch_key, KernelFunction* kernel_func, KernelCacheCreatorFunction* cache_creator_func) {
-  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
-  op.operatorDefIterator_->dispatchTable.registerKernel(std::move(dispatch_key), DispatchTableEntry{kernel_func, cache_creator_func});
-}
-
-inline void Dispatcher::deregisterKernel(const OperatorHandle& op, TensorTypeId dispatch_key) {
-  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
-  op.operatorDefIterator_->dispatchTable.deregisterKernel(dispatch_key);
-}
-
 inline OpKernel Dispatcher::lookup(const OperatorHandle& op, const Stack* stack) const {
   // note: this doesn't need the mutex because write operations on the list keep iterators intact.
   const DispatchTableEntry& kernel = op.operatorDefIterator_->dispatchTable.lookup(stack);

aten/src/ATen/core/jit_type.h

Lines changed: 1 addition & 1 deletion
@@ -514,7 +514,7 @@ struct CAFFE2_API ListType : public SingleElementType<TypeKind::ListType, ListTy
 
 struct DictType;
 using DictTypePtr = std::shared_ptr<DictType>;
-struct DictType : public Type {
+struct CAFFE2_API DictType : public Type {
   friend struct Type;
   static const TypeKind Kind = TypeKind::DictType;
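The only change here is the CAFFE2_API export macro on DictType, making its symbols (vtable, typeinfo) visible to code in other shared libraries, such as the new JIT registration path. As a rough, hedged illustration of what this kind of export macro typically expands to (EXAMPLE_API is a made-up name; the real macro lives in the c10/caffe2 macro headers and is build-dependent):

// Illustration only; not the actual CAFFE2_API definition.
#if defined(_WIN32)
  #define EXAMPLE_API __declspec(dllexport)            // while building the DLL
#else
  #define EXAMPLE_API __attribute__((visibility("default")))
#endif

// Without the export attribute, a polymorphic class compiled with hidden
// visibility may fail to link (or dynamic_cast may misbehave) when used
// from another shared object.
struct EXAMPLE_API ExportedType {
  virtual ~ExportedType() = default;
};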

c10/util/flat_hash_map.h

Lines changed: 1 addition & 0 deletions
@@ -924,6 +924,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal
         return static_cast<Equal &>(*this)(lhs, rhs);
     }
 
+private:
     struct convertible_to_iterator
     {
         EntryPointer it;

caffe2/operators/layer_norm_op.cc

Lines changed: 27 additions & 7 deletions
@@ -196,12 +196,32 @@ void layer_norm_c10(c10::Stack* stack, c10::KernelCache* cache_) { // TODO Pass
   c10::ArrayRef<c10::IValue> inputs = torch::jit::peekSlice(*stack, 0, 3, 6);
   c10::ArrayRef<c10::IValue> outputs = torch::jit::peekSlice(*stack, 3, 3, 6);
 
-  caffe2::Tensor X{c10::C10Tensor(inputs[0].toTensor())};
+
+  caffe2::Tensor X{inputs[0].toTensor()};
   int64_t axis = inputs[1].toInt();
   float epsilon = inputs[2].toDouble();
-  caffe2::Tensor Y{c10::C10Tensor(outputs[0].toTensor())};
-  caffe2::Tensor mean{c10::C10Tensor(outputs[1].toTensor())};
-  caffe2::Tensor sig{c10::C10Tensor(outputs[2].toTensor())};
+
+  auto device = X.GetDevice();
+
+  caffe2::Tensor Y, mean, sig;
+  if (outputs[0].isTensor()) {
+    Y = caffe2::Tensor(std::move(torch::jit::peek(*stack, 0, 3)).toTensor());
+  }
+  if (outputs[1].isTensor()) {
+    mean = caffe2::Tensor(std::move(torch::jit::peek(*stack, 1, 3)).toTensor());
+  }
+  if (outputs[2].isTensor()) {
+    sig = caffe2::Tensor(std::move(torch::jit::peek(*stack, 2, 3)).toTensor());
+  }
+  if (!Y.defined()) {
+    Y = caffe2::empty({0}, device);
+  }
+  if (!mean.defined()) {
+    mean = caffe2::empty({0}, device);
+  }
+  if (!sig.defined()) {
+    sig = caffe2::empty({0}, device);
+  }
 
   caffe2::CPUContext context;
   Cache* cache = static_cast<Cache*>(cache_);
@@ -226,9 +246,9 @@ void layer_norm_c10(c10::Stack* stack, c10::KernelCache* cache_) { // TODO Pass
 
   torch::jit::drop(*stack, 6);
   torch::jit::push(*stack,
-    at::Tensor(c10::C10Tensor(std::move(Y))),
-    at::Tensor(c10::C10Tensor(std::move(mean))),
-    at::Tensor(c10::C10Tensor(std::move(sig)))
+    at::Tensor(std::move(Y)),
+    at::Tensor(std::move(mean)),
+    at::Tensor(std::move(sig))
   );
 
   return;
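A note on the stack helpers used in this kernel (a hedged sketch of the convention with stand-in types, not real IValues): the stack carries all six arguments, three inputs followed by three output slots, and peek(stack, i, N) / peekSlice(stack, i, len, N) index relative to the top-most N entries, so peekSlice(*stack, 0, 3, 6) is the inputs and peek(*stack, 0, 3) is the first output slot.

// Stand-in types for illustration; torch::jit::Stack holds IValues, not ints.
#include <cassert>
#include <cstddef>
#include <vector>

using Value = int;
using Stack = std::vector<Value>;

// Mirrors the indexing of torch::jit::peek(stack, i, N):
// element i of the top-most N stack entries.
Value& peek(Stack& stack, std::size_t i, std::size_t N) {
  return *(stack.end() - N + i);
}

int main() {
  // layer_norm_c10's convention: 3 inputs, then 3 output slots, pushed in order.
  Stack stack = {/*X*/ 10, /*axis*/ 1, /*epsilon*/ 2, /*Y*/ 0, /*mean*/ 0, /*sig*/ 0};
  assert(peek(stack, 0, 6) == 10);  // first input
  assert(peek(stack, 0, 3) == 0);   // first output slot (top-most 3 entries)
  // After computing, the kernel drops all 6 entries and pushes the 3 results.
}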

tools/build_variables.py

Lines changed: 1 addition & 1 deletion
@@ -59,6 +59,7 @@
     "torch/csrc/jit/ir.cpp",
     "torch/csrc/jit/caffe2_operator.cpp",
     "torch/csrc/jit/register_caffe2_ops.cpp",
+    "torch/csrc/jit/register_c10_ops.cpp",
     "torch/csrc/jit/symbolic_script.cpp",
     "torch/csrc/jit/operator.cpp",
     "torch/csrc/jit/passes/alias_analysis.cpp",
@@ -101,7 +102,6 @@
     "torch/csrc/jit/script/lexer.cpp",
     "torch/csrc/jit/script/module.cpp",
     "torch/csrc/jit/tracer.cpp",
-    "torch/csrc/jit/c10_ops/layer_norm.cpp",
     "torch/csrc/utils/tensor_flatten.cpp",
     "torch/csrc/utils/variadic.cpp",
     "torch/csrc/jit/fuser/kernel_cache.cpp",

torch/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -135,6 +135,7 @@ set(TORCH_SRCS
     ${TORCH_SRC_DIR}/csrc/jit/ir.cpp
     ${TORCH_SRC_DIR}/csrc/jit/operator.cpp
     ${TORCH_SRC_DIR}/csrc/jit/caffe2_operator.cpp
+    ${TORCH_SRC_DIR}/csrc/jit/register_c10_ops.cpp
     ${TORCH_SRC_DIR}/csrc/jit/symbolic_script.cpp
     ${TORCH_SRC_DIR}/csrc/jit/passes/alias_analysis.cpp
     ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp
@@ -179,7 +180,6 @@ set(TORCH_SRCS
     ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp
     ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp
     ${TORCH_SRC_DIR}/csrc/jit/hooks_for_testing.cpp
-    ${TORCH_SRC_DIR}/csrc/jit/c10_ops/layer_norm.cpp
     ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp
     ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp
     ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp

torch/csrc/jit/c10_ops/layer_norm.cpp

Lines changed: 0 additions & 64 deletions
This file was deleted.

torch/csrc/jit/operator.h

Lines changed: 30 additions & 0 deletions
@@ -27,6 +27,36 @@ TORCH_API FunctionSchema parseSchema(const std::string& schema);
 
 using OperationCreator = std::function<Operation(const Node*)>;
 
+/*
+ * Note: JIT relies on Operator instances having static lifetime, because
+ * it for example stores a non-owning FunctionSchema* pointer in the Node class,
+ * which points to the function schema stored in the Operator instance.
+ * Also, jit::Operator is meant to store more operator related information like
+ * symbolic derivatives, which also requires them to have static lifetime
+ * so that changes to symbolic derivatives are remembered.
+ *
+ * Now, currently, the c10 operator library doesn't store jit::Operator instances,
+ * but we use a listener pattern that notifies JIT about changes in the
+ * c10 operator library and then registers jit::Operator instances to the JIT
+ * operator registry, acting as wrappers to the c10 operators.
+ *
+ * However, that results in code duplication as JIT and c10 will likely get
+ * their own mechanisms for storing derivatives and other operator related
+ * information, and all of this would have to be wrapped from c10 into JIT.
+ *
+ * We should consider merging the JIT and c10 registries, moving jit::Operator
+ * to c10 and storing these jit::Operator instances in the c10 operator library
+ * instead, allowing us to have these mechanisms only implemented once.
+ * However, the current jit::Operator implementation has additional features
+ * like OperationCreator that aren't needed in c10 (they're only used for
+ * prim ops like If/Else or While which wouldn't be in the c10 operator library),
+ * and which depend on other JIT features which we don't want to move to c10
+ * (notably jit/ir.h). We might, however, be able to split jit::Operator into
+ * a c10::Operator with the core features and a jit::Operator that adds the
+ * JIT-only features like OperationCreator, and then use c10::Operator in the
+ * c10 operator library.
+ */
+
 struct TORCH_API Operator {
   Operator(FunctionSchema schema, OperationCreator op_creator)
     : schema_(std::make_shared<FunctionSchema>(std::move(schema))),
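The note above describes the listener-based bridge this commit adds in torch/csrc/jit/register_c10_ops.cpp (that file is not shown in the scraped diff). A heavily hedged sketch of what such wiring could look like; createJitOperatorFor() is a hypothetical helper standing in for the real conversion logic:

// Hedged sketch only; the real register_c10_ops.cpp differs in detail.
#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/csrc/jit/operator.h>

namespace torch { namespace jit { namespace {

class RegistrationListener final : public c10::OpRegistrationListener {
 public:
  void onOperatorRegistered(const c10::OperatorHandle& op) override {
    // Wrap the c10 op as a jit::Operator and add it to the JIT registry.
    // createJitOperatorFor() is a hypothetical helper that would build an
    // Operation popping/pushing IValues on the JIT stack and calling into
    // the c10 dispatcher for this OperatorHandle.
    // registerOperator(createJitOperatorFor(op));
  }
  void onOperatorDeregistered(const c10::OperatorHandle& /*op*/) override {
    // jit::Operator instances have static lifetime (see note above),
    // so deregistration would be a no-op or an error in this sketch.
  }
};

struct Registerer final {
  Registerer() {
    // addRegistrationListener() replays ops that were registered before this
    // static initializer ran, so none are missed.
    c10::Dispatcher::singleton().addRegistrationListener(
        c10::guts::make_unique<RegistrationListener>());
  }
};
Registerer registerer;  // hooks the listener up during static initialization

}}} // namespace torch::jit::<anonymous>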
