Skip to content

Commit e7653c7

Browse files
Yinghai Lufacebook-github-bot
Yinghai Lu
authored andcommitted
New chaining/partitioning algorithm for async_scheduling for inference (pytorch#11957)
Summary: Pull Request resolved: pytorch#11957 For distributed inference, we want to use the async_scheduling net to run the net as we need its async part. However, according to the profiling, async_net has a big overhead of dispatching tasks onto worker threads. This diff improves the issue by generating a smaller number of chains/tasks by grouping the sync ops that can be run in one shot. Note that it also schedules each individual async op as its own single-op chain because, unlike gpu ops, rpc ops are not guaranteed to be linearized at the remote site. For example, if you have two rpc ops `op1->op2`, op2 won't implicitly block until op1 finishes. Therefore we need to put each of the async ops in its own chain, as the async_scheduling net will only sync the tail of each chain. For all-sync-op nets, this change gives us `1.5X` slower than simple_net, while without the change, it is `7X` slower. The next step is to work on the executor to make the task scheduling faster, and to add a fallback path to be able to run ops inline if it's an all-sync net. Reviewed By: ilia-cher Differential Revision: D9874140 fbshipit-source-id: fcd45328698c29211f2c06ee3287194acda12227
1 parent f1f521f commit e7653c7

File tree

4 files changed

+399
-4
lines changed

4 files changed

+399
-4
lines changed

caffe2/core/net_async_base.cc

+10-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ C10_DEFINE_int(
1313

1414
C10_DECLARE_bool(caffe2_dag_net_collect_stats);
1515

16+
C10_DEFINE_bool(
17+
caffe2_net_async_inference_mode,
18+
false,
19+
"If set, use one single chain containing all ops");
20+
1621
C10_DEFINE_bool(
1722
caffe2_net_async_finish_chain,
1823
false,
@@ -73,7 +78,11 @@ AsyncNetBase::AsyncNetBase(
7378
operators_.push_back(op_ptr);
7479
}
7580

76-
execution_chains_ = dag_utils::computeChains(operator_nodes_);
81+
if (c10::FLAGS_caffe2_net_async_inference_mode) {
82+
execution_chains_ = dag_utils::computeGroups(operator_nodes_);
83+
} else {
84+
execution_chains_ = dag_utils::computeChains(operator_nodes_);
85+
}
7786
chains_.reserve(execution_chains_.size());
7887
for (const auto& kv : execution_chains_) {
7988
chains_.push_back(kv.second);

caffe2/core/net_dag_utils.cc

+82
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,88 @@ ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes) {
278278
return chains;
279279
}
280280

281+
// Here chains are essentially groups, we used chain/group interchangeably
282+
ExecutionChains computeGroups(std::vector<OperatorNode>& orig_nodes) {
283+
const std::vector<OpGraphNode> nodes = pruneOpNodeGraph(orig_nodes);
284+
ExecutionChains chains;
285+
std::vector<int> sync_frontier;
286+
std::vector<int> async_frontier;
287+
288+
std::vector<int> in_degrees;
289+
in_degrees.reserve(nodes.size());
290+
std::transform(
291+
nodes.begin(),
292+
nodes.end(),
293+
std::back_inserter(in_degrees),
294+
[](const OpGraphNode& n) { return n.parents_.size(); });
295+
296+
// Screen out the primary root nodes
297+
for (int idx = 0; idx < (int)nodes.size(); ++idx) {
298+
if (in_degrees[idx] == 0) {
299+
if (orig_nodes[idx].operator_->HasAsyncPart()) {
300+
async_frontier.push_back(idx);
301+
} else {
302+
sync_frontier.push_back(idx);
303+
}
304+
}
305+
}
306+
307+
// We check sync ops on the frontier first and then async ops. This gives us a
308+
// head start to execute sync ops locally while waiting for async ops to
309+
// finish.
310+
std::queue<int> q;
311+
while (!(async_frontier.empty() && sync_frontier.empty())) {
312+
// Sync ops
313+
for (const auto i : sync_frontier) {
314+
q.push(i);
315+
}
316+
sync_frontier.clear();
317+
std::vector<int> chain;
318+
while (!q.empty()) {
319+
int idx = q.front();
320+
q.pop();
321+
chain.push_back(idx);
322+
for (int child : nodes[idx].children_) {
323+
if (--in_degrees[child] == 0) {
324+
if (orig_nodes[child].operator_->HasAsyncPart()) {
325+
async_frontier.push_back(child);
326+
} else {
327+
q.push(child);
328+
}
329+
}
330+
}
331+
}
332+
// add the whole group of consecutive sync ops into one chain
333+
if (!chain.empty()) {
334+
chains.emplace(chain.front(), chain);
335+
}
336+
337+
// Async ops
338+
for (const auto i : async_frontier) {
339+
q.push(i);
340+
}
341+
async_frontier.clear();
342+
while (!q.empty()) {
343+
int idx = q.front();
344+
q.pop();
345+
// Put each individual node as a new chain
346+
chains[idx] = {idx};
347+
for (int child : nodes[idx].children_) {
348+
if (--in_degrees[child] == 0) {
349+
if (orig_nodes[child].operator_->HasAsyncPart()) {
350+
q.push(child);
351+
} else {
352+
sync_frontier.push_back(child);
353+
}
354+
}
355+
}
356+
}
357+
}
358+
359+
updateOperatorNodes(orig_nodes, chains);
360+
return chains;
361+
}
362+
281363
ExecutionChains singleChains(std::vector<OperatorNode>& nodes) {
282364
ExecutionChains chains;
283365
for (int i = 0; i < (int)nodes.size(); ++i) {

caffe2/core/net_dag_utils.h

+11-3
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,19 @@ struct OpGraphNode {
4343

4444
using ExecutionChains = std::unordered_map<int, std::vector<int>>;
4545

46-
ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes);
46+
C10_EXPORT ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes);
4747

48-
ExecutionChains singleChains(std::vector<OperatorNode>& nodes);
48+
// Instead of breaking down the DAG into chains, we partition it into clusters
49+
// of sync ops and individual async ops. This is useful for the distributed inference
50+
// case where we have sync and async cpu ops. Note that we have to sync each
51+
// async op individually instead of putting them into a chain and syncing its tail like a GPU op,
52+
// because CPU async ops are typically rpc calls and are not guaranteed to be
53+
// linearized at the remote site.
54+
C10_EXPORT ExecutionChains computeGroups(std::vector<OperatorNode>& orig_nodes);
4955

50-
std::vector<OperatorNode> prepareOperatorNodes(
56+
C10_EXPORT ExecutionChains singleChains(std::vector<OperatorNode>& nodes);
57+
58+
C10_EXPORT std::vector<OperatorNode> prepareOperatorNodes(
5159
const std::shared_ptr<const NetDef>& net_def,
5260
Workspace* ws);
5361

0 commit comments

Comments
 (0)