Commit ab94a99

Do GC collect after dcp.save and dcp.load (#839)
We disable automatic GC and manually run a collection every 50 steps. This can cause issues when the checkpoint interval is smaller than the GC interval, because the garbage produced by frequent saves accumulates between collections. This PR changes the checkpoint manager to run a GC collection whenever a save or load happens.
1 parent: fb0a942

2 files changed (+23, -5 lines)

torchtitan/checkpoint.py (+16, -3)

@@ -30,6 +30,7 @@
 from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.logging import init_logger, logger
 from torchtitan.optimizer import LRSchedulersContainer, OptimizersContainer
+from torchtitan.utils import GarbageCollection


 class IntervalType(enum.Enum):
@@ -106,6 +107,12 @@ class SaveDone:
     pass


+@torch.no_grad()
+def save_with_gc(state, checkpoint_id):
+    dcp.save(state, checkpoint_id=checkpoint_id)
+    GarbageCollection.collect("GC collection invoked by checkpointer.")
+
+
 def checkpoint_mp(recv, send):
     init_logger()
     os.environ["MASTER_PORT"] = str(int(os.environ["MASTER_PORT"]) + 2)
@@ -125,7 +132,7 @@ def checkpoint_mp(recv, send):
             assert isinstance(obj, tuple)
             begin = time.monotonic()
             state, checkpoint_id = obj
-            dcp.save(state, checkpoint_id=checkpoint_id)
+            save_with_gc(state, checkpoint_id=checkpoint_id)
             logger.info(
                 "Finish saving the checkpoint in the background process in "
                 f"{time.monotonic() - begin:.2f} seconds."
@@ -274,7 +281,7 @@ def _save_last_step(self, curr_step: int) -> None:
         else:
             logger.info(f"Saving a full checkpoint at last step, step {curr_step}.")

-        dcp.save(self.states, checkpoint_id=self._create_checkpoint_id(curr_step))
+        save_with_gc(self.states, checkpoint_id=self._create_checkpoint_id(curr_step))
         self.reset()

     def _should_save(self, curr_step: int, force: bool = False) -> bool:
@@ -363,16 +370,21 @@ def save(self, curr_step: int, force: bool = False) -> None:
         begin = time.monotonic()
         checkpoint_id = self._create_checkpoint_id(curr_step)
         self._async_wait()
+        # This GC is called for async checkpoint as it is useless to do
+        # GC right after async_save -- the CPU memory is not able to be
+        # freed until _async_wait()
         if force:
             self._save_last_step(curr_step)
         elif self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM:
+            GarbageCollection.collect("GC collection invoked by checkpointer.")
             self._async_with_pinned_memory(checkpoint_id)
         elif self.async_mode == AsyncMode.ASYNC:
+            GarbageCollection.collect("GC collection invoked by checkpointer.")
             self.async_future = dcp.async_save(
                 self.states, checkpoint_id=checkpoint_id, process_group=self.pg
             )
         else:
-            dcp.save(self.states, checkpoint_id=checkpoint_id)
+            save_with_gc(self.states, checkpoint_id=checkpoint_id)
         self.reset()
         self._purge_stale_checkpoints()

@@ -451,6 +463,7 @@ def load(self, step: int = -1) -> bool:
         # bugfix from above: restore the original stateful objects,
         # whose states were already updated in-place by dcp.load()
         states.update(original_stateful_states)
+        GarbageCollection.collect("GC collection for checkpoint loading.")
         return True

     def _purge_stale_checkpoints(self):
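For the synchronous paths the change simply wraps the blocking dcp.save with an immediate collection, so the temporaries created while serializing the state dict are reclaimed right away instead of waiting for the next periodic GC. A minimal, self-contained sketch of that pattern, assuming a single process (recent PyTorch versions let dcp.save run without an initialized process group) and a writable ./checkpoint_demo directory; the toy model and the checkpoint path are illustrative only, not torchtitan's actual state objects:

import gc

import torch
import torch.distributed.checkpoint as dcp
from torch import nn


@torch.no_grad()
def save_with_gc(state, checkpoint_id):
    # Blocking save, then an immediate collection so the garbage produced
    # while writing the checkpoint is reclaimed right away rather than
    # waiting for the next scheduled collection.
    dcp.save(state, checkpoint_id=checkpoint_id)
    gc.collect(1)


# Hypothetical usage with a toy model.
model = nn.Linear(8, 8)
save_with_gc({"model": model.state_dict()}, checkpoint_id="./checkpoint_demo")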

torchtitan/utils.py (+7, -2)

@@ -163,11 +163,16 @@ def __init__(self, gc_freq=1000):
         assert gc_freq > 0, "gc_freq must be a positive integer"
         self.gc_freq = gc_freq
         gc.disable()
-        gc.collect(1)
+        self.collect("Initial GC collection.")

     def run(self, step_count):
         if step_count > 1 and step_count % self.gc_freq == 0:
-            gc.collect(1)
+            self.collect("Peforming periodical GC collection.")
+
+    @staticmethod
+    def collect(reason: str):
+        logger.info(reason)
+        gc.collect(1)


 TRACE_BUFFER_SIZE = "TORCH_NCCL_TRACE_BUFFER_SIZE"
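The helper keeps automatic GC disabled for the whole run and only collects on its own schedule, which is why the checkpointer now has to trigger collections itself. A rough usage sketch; the bare loop, step counts, and checkpoint interval are assumptions for illustration, and only the GarbageCollection API comes from the diff above:

from torchtitan.utils import GarbageCollection

# Automatic GC is disabled inside the constructor; collections now happen
# only every gc_freq steps plus wherever collect() is called explicitly.
gc_handler = GarbageCollection(gc_freq=50)

for step in range(1, 201):
    # ... one training step would run here ...

    if step % 10 == 0:
        # Checkpoints happen more often than the periodic GC (every 10 steps
        # vs. every 50), so after this commit the checkpointer collects on its
        # own right after each dcp.save / dcp.load.
        GarbageCollection.collect("GC collection invoked by checkpointer.")

    gc_handler.run(step)  # periodic collection on the gc_freq schedule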
