From 53ec1b154bf2cab69bf541ba61b089ee76adb1a9 Mon Sep 17 00:00:00 2001
From: Luca Wehrstedt <lcw@fb.com>
Date: Mon, 7 Oct 2019 07:04:04 -0700
Subject: [PATCH] Fix race condition in writing config to checkpoint

Summary:
We used to have _all_ trainers write the config to the checkpoint at the same time. This is already problematic, but what's worse is that only trainer 0 was creating the checkpoint directory. Thus, if the directory didn't exist and a non-0 trainer was the first to reach that point, the write would fail.

I'm fixing it in the same way we fixed all other similar issues: by having only the rank-0 trainer write the config.

Reviewed By: adamlerer

Differential Revision: D17787303

fbshipit-source-id: c3464dd9929ff95d54865ed03f041388d85c6f0d
---
 torchbiggraph/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torchbiggraph/train.py b/torchbiggraph/train.py
index 33301fa2..b6cecbb7 100644
--- a/torchbiggraph/train.py
+++ b/torchbiggraph/train.py
@@ -470,7 +470,8 @@ def make_optimizer(params: Iterable[torch.nn.Parameter], is_emb: bool) -> Optimi
         subprocess_init=subprocess_init,
     )
     checkpoint_manager.register_metadata_provider(ConfigMetadataProvider(config))
-    checkpoint_manager.write_config(config)
+    if rank == 0:
+        checkpoint_manager.write_config(config)
 
     if config.num_edge_chunks is not None:
         num_edge_chunks = config.num_edge_chunks