From 53ec1b154bf2cab69bf541ba61b089ee76adb1a9 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Mon, 7 Oct 2019 07:04:04 -0700 Subject: [PATCH] Fix race condition in writing config to checkpoint Summary: We used to have _all_ trainers write the config to the checkpoint, at the same time. This is already problematic, but what's worse is that only trainer 0 was creating the checkpoint directory. Thus, if it didn't exist and a non-0 trainer was the first to reach that point, the write would fail. I'm fixing it in the same way we fixed all other similar issues: have only the rank-0 trainer write this. Reviewed By: adamlerer Differential Revision: D17787303 fbshipit-source-id: c3464dd9929ff95d54865ed03f041388d85c6f0d --- torchbiggraph/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchbiggraph/train.py b/torchbiggraph/train.py index 33301fa2..b6cecbb7 100644 --- a/torchbiggraph/train.py +++ b/torchbiggraph/train.py @@ -470,7 +470,8 @@ def make_optimizer(params: Iterable[torch.nn.Parameter], is_emb: bool) -> Optimi subprocess_init=subprocess_init, ) checkpoint_manager.register_metadata_provider(ConfigMetadataProvider(config)) - checkpoint_manager.write_config(config) + if rank == 0: + checkpoint_manager.write_config(config) if config.num_edge_chunks is not None: num_edge_chunks = config.num_edge_chunks