Skip to content

Commit 90d5023

Browse files
authored
Add small util to enable FSDP offloading quickly (#3006)
* Wrap up util
* Add small util
* Update doc
* Don't req
* Clean
1 parent 3bde615 commit 90d5023

File tree

5 files changed

+58
-16
lines changed

5 files changed

+58
-16
lines changed

docs/source/package_reference/fsdp.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ rendered properly in your Markdown viewer.
1515

1616
# Utilities for Fully Sharded Data Parallelism
1717

18+
[[autodoc]] utils.enable_fsdp_ram_efficient_loading
19+
20+
[[autodoc]] utils.disable_fsdp_ram_efficient_loading
21+
1822
[[autodoc]] utils.merge_fsdp_weights
1923

20-
[[autodoc]] utils.FullyShardedDataParallelPlugin
24+
[[autodoc]] utils.FullyShardedDataParallelPlugin

src/accelerate/utils/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,15 @@
188188
)
189189

190190
from .bnb import has_4bit_bnb_layers, load_and_quantize_model
191-
from .fsdp_utils import load_fsdp_model, load_fsdp_optimizer, merge_fsdp_weights, save_fsdp_model, save_fsdp_optimizer
191+
from .fsdp_utils import (
192+
disable_fsdp_ram_efficient_loading,
193+
enable_fsdp_ram_efficient_loading,
194+
load_fsdp_model,
195+
load_fsdp_optimizer,
196+
merge_fsdp_weights,
197+
save_fsdp_model,
198+
save_fsdp_optimizer,
199+
)
192200
from .launch import (
193201
PrepareForLaunch,
194202
_filter_args,

src/accelerate/utils/dataclasses.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1300,7 +1300,7 @@ class FullyShardedDataParallelPlugin:
13001300
"for reduced memory usage. Defaults to `False`"
13011301
},
13021302
)
1303-
ram_efficient_loading: bool = field(
1303+
cpu_ram_efficient_loading: bool = field(
13041304
default=None,
13051305
metadata={
13061306
"help": "If True, only the first process loads the pretrained model checkpoint while all other processes have empty weights. "
@@ -1399,12 +1399,12 @@ def __post_init__(self):
13991399
str_to_bool(os.environ.get(env_prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1
14001400
)
14011401

1402-
if self.ram_efficient_loading is None:
1403-
self.ram_efficient_loading = (
1404-
str_to_bool(os.environ.get(env_prefix + "RAM_EFFICIENT_LOADING", "False")) == 1
1402+
if self.cpu_ram_efficient_loading is None:
1403+
self.cpu_ram_efficient_loading = (
1404+
str_to_bool(os.environ.get(env_prefix + "CPU_RAM_EFFICIENT_LOADING", "False")) == 1
14051405
)
14061406

1407-
if self.ram_efficient_loading and not self.sync_module_states:
1407+
if self.cpu_ram_efficient_loading and not self.sync_module_states:
14081408
warnings.warn(
14091409
"sync_module_states cannot be False since efficient cpu ram loading is enabled. "
14101410
"Setting sync_module_states to True."

src/accelerate/utils/fsdp_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,23 @@
2727
logger = get_logger(__name__)
2828

2929

30+
def enable_fsdp_ram_efficient_loading():
31+
"""
32+
Enables RAM efficient loading of Hugging Face models for FSDP in the environment.
33+
"""
34+
# Sets values for `transformers.modeling_utils.is_fsdp_enabled`
35+
if "ACCELERATE_USE_FSDP" not in os.environ:
36+
os.environ["ACCELERATE_USE_FSDP"] = "True"
37+
os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "True"
38+
39+
40+
def disable_fsdp_ram_efficient_loading():
41+
"""
42+
Disables RAM efficient loading of Hugging Face models for FSDP in the environment.
43+
"""
44+
os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "False"
45+
46+
3047
def _get_model_state_dict(model, adapter_only=False):
3148
if adapter_only and is_peft_model(model):
3249
from peft import get_peft_model_state_dict

tests/fsdp/test_fsdp.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
FSDP_STATE_DICT_TYPE,
4343
)
4444
from accelerate.utils.dataclasses import FullyShardedDataParallelPlugin
45+
from accelerate.utils.fsdp_utils import disable_fsdp_ram_efficient_loading, enable_fsdp_ram_efficient_loading
4546
from accelerate.utils.other import patch_environment
4647

4748

@@ -98,16 +99,18 @@ def test_backward_prefetch(self):
9899

99100
for i, prefetch_policy in enumerate(FSDP_BACKWARD_PREFETCH):
100101
expected_value = None if prefetch_policy == "NO_PREFETCH" else BackwardPrefetch(i + 1)
101-
# env = self.fsdp_env.copy()
102-
# env["FSDP_BACKWARD_PREFETCH"] = prefetch_policy
103-
# with mockenv_context(**env):
104-
# fsdp_plugin = FullyShardedDataParallelPlugin()
105-
# assert fsdp_plugin.backward_prefetch == expected_value, f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
102+
env = self.fsdp_env.copy()
103+
env["FSDP_BACKWARD_PREFETCH"] = prefetch_policy
104+
with mockenv_context(**env):
105+
fsdp_plugin = FullyShardedDataParallelPlugin()
106+
assert (
107+
fsdp_plugin.backward_prefetch == expected_value
108+
), f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
106109

107-
# # Check if torch enum works
108-
# if prefetch_policy != "NO_PREFETCH":
109-
# fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=BackwardPrefetch(i + 1))
110-
# assert fsdp_plugin.backward_prefetch == expected_value
110+
# Check if torch enum works
111+
if prefetch_policy != "NO_PREFETCH":
112+
fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=BackwardPrefetch(i + 1))
113+
assert fsdp_plugin.backward_prefetch == expected_value
111114

112115
# Check if name works
113116
fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=prefetch_policy)
@@ -263,6 +266,16 @@ def test_cpu_offload(self):
263266
fsdp_plugin = FullyShardedDataParallelPlugin(cpu_offload=flag)
264267
assert fsdp_plugin.cpu_offload == CPUOffload(offload_params=flag)
265268

269+
def test_cpu_ram_efficient_loading(self):
270+
enable_fsdp_ram_efficient_loading()
271+
fsdp_plugin = FullyShardedDataParallelPlugin()
272+
assert fsdp_plugin.cpu_ram_efficient_loading is True
273+
assert os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING") == "True"
274+
disable_fsdp_ram_efficient_loading()
275+
fsdp_plugin = FullyShardedDataParallelPlugin()
276+
assert fsdp_plugin.cpu_ram_efficient_loading is False
277+
assert os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING") == "False"
278+
266279

267280
# Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
268281
@require_non_torch_xla

0 commit comments

Comments (0)