Skip to content

Commit 2d21055

Browse files
committed
Add distributed training.
1 parent 8aaccbc commit 2d21055

File tree

7 files changed

+159
-0
lines changed

7 files changed

+159
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ char_vocab*.txt
77
glove*.txt
88
glove*.txt.filtered
99
*.v*_*_conll
10+
*.hdf5

continuous_evaluate.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python
2+
from __future__ import absolute_import
3+
from __future__ import division
4+
from __future__ import print_function
5+
6+
import os
7+
import re
8+
import time
9+
import shutil
10+
11+
import tensorflow as tf
12+
import coref_model as cm
13+
import util
14+
15+
def copy_checkpoint(source, target):
    """Duplicate a TensorFlow V2 checkpoint from `source` to `target`.

    A V2 checkpoint on disk is a pair of files sharing a common prefix:
    an `.index` file and a single data shard. Both components are copied
    so that `target` forms a complete, restorable checkpoint prefix.
    """
    checkpoint_suffixes = (".index", ".data-00000-of-00001")
    for suffix in checkpoint_suffixes:
        shutil.copyfile("{}{}".format(source, suffix),
                        "{}{}".format(target, suffix))
18+
19+
if __name__ == "__main__":
    # Continuously polls the training log directory for fresh checkpoints,
    # evaluates each one exactly once, and keeps a copy of the best model
    # (by F1) at model.max.ckpt.
    config = util.initialize_from_env()
    model = cm.CorefModel(config)

    saver = tf.train.Saver()
    log_dir = config["log_dir"]
    writer = tf.summary.FileWriter(log_dir, flush_secs=20)
    evaluated_checkpoints = set()
    max_f1 = 0
    # Raw string: in a plain literal "\Z" is an invalid escape sequence
    # (DeprecationWarning) that only works by accident. \Z anchors the
    # match at end-of-string, so group(1) is the trailing step number.
    checkpoint_pattern = re.compile(r".*model.ckpt-([0-9]*)\Z")

    with tf.Session() as session:
        while True:
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path and ckpt.model_checkpoint_path not in evaluated_checkpoints:
                print("Evaluating {}".format(ckpt.model_checkpoint_path))

                # Move it to a temporary location to avoid being deleted by the training supervisor.
                tmp_checkpoint_path = os.path.join(log_dir, "model.tmp.ckpt")
                copy_checkpoint(ckpt.model_checkpoint_path, tmp_checkpoint_path)

                global_step = int(checkpoint_pattern.match(ckpt.model_checkpoint_path).group(1))
                # Restore from the stable copy: the original checkpoint may be
                # garbage-collected by the training supervisor at any moment,
                # which is exactly why it was copied above.
                saver.restore(session, tmp_checkpoint_path)

                eval_summary, f1 = model.evaluate(session)

                if f1 > max_f1:
                    max_f1 = f1
                    # Preserve the best-so-far weights under a fixed name.
                    copy_checkpoint(tmp_checkpoint_path, os.path.join(log_dir, "model.max.ckpt"))

                print("Current max F1: {:.2f}".format(max_f1))

                writer.add_summary(eval_summary, global_step)
                print("Evaluation written to {} at step {}".format(log_dir, global_step))

                evaluated_checkpoints.add(ckpt.model_checkpoint_path)
                sleep_time = 60
            else:
                sleep_time = 10
            print("Waiting for {} seconds before looking for next checkpoint.".format(sleep_time))
            time.sleep(sleep_time)

evaluate.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#!/usr/bin/env python
12
from __future__ import absolute_import
23
from __future__ import division
34
from __future__ import print_function

experiments.conf

+10
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@ glove_300d_2w {
1212
size = 300
1313
}
1414

15+
# Distributed training configurations.
16+
two_local_gpus {
17+
addresses {
18+
ps = [localhost:2222]
19+
worker = [localhost:2223, localhost:2224]
20+
}
21+
gpus = [0, 1]
22+
}
23+
1524
# Main configuration.
1625
best {
1726
# Computation limits.
@@ -59,6 +68,7 @@ best {
5968
eval_frequency = 5000
6069
report_frequency = 100
6170
log_root = logs
71+
cluster = ${two_local_gpus}
6272
}
6373

6474
# For evaluation. Do not use for training (i.e. only for predict.py, evaluate.py, and demo.py). Rename `best` directory to `final`.

ps.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env python
2+
3+
import os
4+
5+
import tensorflow as tf
6+
import util
7+
8+
if __name__ == "__main__":
    # Parameter-server entry point for distributed training: it only hosts
    # shared variables and serves them to the workers.
    config = util.initialize_from_env()
    cluster_config = config["cluster"]
    # The parameter server does no GPU computation; hide all GPUs from it.
    # (Removed an unused `report_frequency` config lookup from the original.)
    util.set_gpus()
    cluster = tf.train.ClusterSpec(cluster_config["addresses"])
    server = tf.train.Server(cluster, job_name="ps", task_index=0)
    # Block forever, serving until the process is killed.
    server.join()

train.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#!/usr/bin/env python
2+
from __future__ import absolute_import
3+
from __future__ import division
4+
from __future__ import print_function
25

36
import os
47
import time

worker.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python
2+
from __future__ import absolute_import
3+
from __future__ import division
4+
from __future__ import print_function
5+
6+
import os
7+
import sys
8+
import time
9+
10+
import tensorflow as tf
11+
import coref_model as cm
12+
import util
13+
14+
if __name__ == "__main__":
    # Worker entry point for distributed training. The worker's task index
    # comes from the TASK environment variable; the cluster layout and GPU
    # assignment come from the experiment config.
    config = util.initialize_from_env()
    task_index = int(os.environ["TASK"])

    report_frequency = config["report_frequency"]
    cluster_config = config["cluster"]

    # Pin this worker to its configured GPU before any TF graph is built.
    util.set_gpus(cluster_config["gpus"][task_index])

    cluster = tf.train.ClusterSpec(cluster_config["addresses"])
    server = tf.train.Server(cluster,
                             job_name="worker",
                             task_index=task_index)

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % task_index, cluster=cluster)):
        model = cm.CorefModel(config)
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

    log_dir = config["log_dir"]
    # Per-worker summary subdirectory so workers don't clobber each other.
    writer = tf.summary.FileWriter(os.path.join(log_dir, "w{}".format(task_index)), flush_secs=20)

    is_chief = (task_index == 0)

    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=log_dir,
                             init_op=init_op,
                             saver=saver,
                             global_step=model.global_step,
                             save_model_secs=120)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as session:
        model.start_enqueue_thread(session)
        accumulated_loss = 0.0
        initial_time = time.time()
        while not sv.should_stop():
            tf_loss, tf_global_step, _ = session.run([model.loss, model.global_step, model.train_op])
            accumulated_loss += tf_loss

            if tf_global_step % report_frequency == 0:
                total_time = time.time() - initial_time
                steps_per_second = tf_global_step / total_time

                average_loss = accumulated_loss / report_frequency
                # Fix: report the loss averaged over the reporting window.
                # The original computed `average_loss` but printed the
                # single most recent batch loss (`tf_loss`) instead.
                print("[{}] loss={:.2f}, steps/s={:.2f}".format(tf_global_step, average_loss, steps_per_second))
                accumulated_loss = 0.0
                # Fix: tag the summary with the global step (as
                # continuous_evaluate.py does) so TensorBoard can plot it
                # as a time series instead of collapsing the points.
                writer.add_summary(util.make_summary({
                    "Train Loss": average_loss,
                    "Steps per second": steps_per_second
                }), tf_global_step)

    # Ask for all the services to stop.
    sv.stop()

0 commit comments

Comments
 (0)