
Commit e8da75d

sayakpaul and beniz authored
[bitsandbytes] allow direct CUDA placement of pipelines loaded with bnb components (#9840)
* allow device placement when using bnb quantization.
* warning.
* tests
* fixes
* docs.
* require accelerate version.
* remove print.
* revert to()
* tests
* fixes
* fix: missing AutoencoderKL lora adapter (#9807)
* fix: missing AutoencoderKL lora adapter
* fix
---------
Co-authored-by: Sayak Paul <[email protected]>
* fixes
* fix condition test
* updates
* updates
* remove is_offloaded.
* fixes
* better
* empty
---------
Co-authored-by: Emmanuel Benazera <[email protected]>
1 parent 8a450c3 commit e8da75d
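
What the change enables, in user-facing terms: a pipeline whose components were loaded with bitsandbytes quantization can now be moved to CUDA with a plain `.to("cuda")` call, provided a recent enough `accelerate` is installed. The following is a minimal sketch mirroring the new tests; the checkpoint id is an assumption used only for illustration (any SD3-style checkpoint with a `transformer` subfolder would look the same), not something pinned by this commit.

import torch
from diffusers import BitsAndBytesConfig, DiffusionPipeline, SD3Transformer2DModel

# Hypothetical checkpoint id, for illustration only.
model_id = "stabilityai/stable-diffusion-3-medium-diffusers"

# Quantize the transformer to 4-bit NF4 with bitsandbytes.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
transformer_4bit = SD3Transformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.float16,
)

# With this commit (and accelerate >= 1.1.0), moving the whole pipeline to CUDA works.
pipe = DiffusionPipeline.from_pretrained(
    model_id,
    transformer=transformer_4bit,
    torch_dtype=torch.float16,
).to("cuda")

image = pipe("a photo of a table", num_inference_steps=2).images[0]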

File tree

3 files changed: +91 -5 lines changed


src/diffusers/pipelines/pipeline_utils.py

+11 -5
@@ -66,7 +66,6 @@
 if is_torch_npu_available():
     import torch_npu  # noqa: F401

-
 from .pipeline_loading_utils import (
     ALL_IMPORTABLE_CLASSES,
     CONNECTED_PIPES_KEYS,
@@ -388,6 +387,7 @@ def to(self, *args, **kwargs):
         )

         device = device or device_arg
+        pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items())

         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
         def module_is_sequentially_offloaded(module):
@@ -410,10 +410,16 @@ def module_is_offloaded(module):
         pipeline_is_sequentially_offloaded = any(
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
-        if pipeline_is_sequentially_offloaded and device and torch.device(device).type == "cuda":
-            raise ValueError(
-                "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
-            )
+        if device and torch.device(device).type == "cuda":
+            if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
+                raise ValueError(
+                    "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
+                )
+            # PR: https://github.com/huggingface/accelerate/pull/3223/
+            elif pipeline_has_bnb and is_accelerate_version("<", "1.1.0.dev0"):
+                raise ValueError(
+                    "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation."
+                )

         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
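
For readers skimming the diff, the reworked guard above can be read in isolation as follows. This is a simplified sketch of the decision logic rather than the actual method body; the two boolean arguments stand in for the values that `to()` computes from `self.components`, and the error messages are shortened.

import torch
from diffusers.utils import is_accelerate_version

def _validate_cuda_move(device, pipeline_is_sequentially_offloaded, pipeline_has_bnb):
    # Simplified sketch of the new guard in DiffusionPipeline.to().
    if device and torch.device(device).type == "cuda":
        if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
            # Sequential CPU offload remains incompatible with a manual move to the GPU.
            raise ValueError("Remove `enable_sequential_cpu_offload` before calling `.to('cuda')`.")
        elif pipeline_has_bnb and is_accelerate_version("<", "1.1.0.dev0"):
            # bnb-quantized pipelines need the fix from huggingface/accelerate#3223.
            raise ValueError("Upgrade `accelerate` to move a bnb-quantized pipeline to CUDA.")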

tests/quantization/bnb/test_4bit.py

+44 -1
@@ -18,10 +18,11 @@
 import unittest

 import numpy as np
+import pytest
 import safetensors.torch

 from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel
-from diffusers.utils import logging
+from diffusers.utils import is_accelerate_version, logging
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     is_bitsandbytes_available,
@@ -47,6 +48,7 @@ def get_some_linear_layer(model):


 if is_transformers_available():
+    from transformers import BitsAndBytesConfig as BnbConfig
     from transformers import T5EncoderModel

 if is_torch_available():
@@ -483,6 +485,47 @@ def test_moving_to_cpu_throws_warning(self):

         assert "Pipelines loaded with `dtype=torch.float16`" in cap_logger.out

+    @pytest.mark.xfail(
+        condition=is_accelerate_version("<=", "1.1.1"),
+        reason="Test will pass after https://github.com/huggingface/accelerate/pull/3223 is in a release.",
+        strict=True,
+    )
+    def test_pipeline_cuda_placement_works_with_nf4(self):
+        transformer_nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        transformer_4bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name,
+            subfolder="transformer",
+            quantization_config=transformer_nf4_config,
+            torch_dtype=torch.float16,
+        )
+        text_encoder_3_nf4_config = BnbConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        text_encoder_3_4bit = T5EncoderModel.from_pretrained(
+            self.model_name,
+            subfolder="text_encoder_3",
+            quantization_config=text_encoder_3_nf4_config,
+            torch_dtype=torch.float16,
+        )
+        # CUDA device placement works.
+        pipeline_4bit = DiffusionPipeline.from_pretrained(
+            self.model_name,
+            transformer=transformer_4bit,
+            text_encoder_3=text_encoder_3_4bit,
+            torch_dtype=torch.float16,
+        ).to("cuda")
+
+        # Check if inference works.
+        _ = pipeline_4bit("table", max_sequence_length=20, num_inference_steps=2)
+
+        del pipeline_4bit
+

 @require_transformers_version_greater("4.44.0")
 class SlowBnb4BitFluxTests(Base4bitTests):

tests/quantization/bnb/test_mixed_int8.py

+36
@@ -17,8 +17,10 @@
 import unittest

 import numpy as np
+import pytest

 from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel, logging
+from diffusers.utils import is_accelerate_version
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     is_bitsandbytes_available,
@@ -44,6 +46,7 @@ def get_some_linear_layer(model):


 if is_transformers_available():
+    from transformers import BitsAndBytesConfig as BnbConfig
     from transformers import T5EncoderModel

 if is_torch_available():
@@ -432,6 +435,39 @@ def test_generate_quality_dequantize(self):
             output_type="np",
         ).images

+    @pytest.mark.xfail(
+        condition=is_accelerate_version("<=", "1.1.1"),
+        reason="Test will pass after https://github.com/huggingface/accelerate/pull/3223 is in a release.",
+        strict=True,
+    )
+    def test_pipeline_cuda_placement_works_with_mixed_int8(self):
+        transformer_8bit_config = BitsAndBytesConfig(load_in_8bit=True)
+        transformer_8bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name,
+            subfolder="transformer",
+            quantization_config=transformer_8bit_config,
+            torch_dtype=torch.float16,
+        )
+        text_encoder_3_8bit_config = BnbConfig(load_in_8bit=True)
+        text_encoder_3_8bit = T5EncoderModel.from_pretrained(
+            self.model_name,
+            subfolder="text_encoder_3",
+            quantization_config=text_encoder_3_8bit_config,
+            torch_dtype=torch.float16,
+        )
+        # CUDA device placement works.
+        pipeline_8bit = DiffusionPipeline.from_pretrained(
+            self.model_name,
+            transformer=transformer_8bit,
+            text_encoder_3=text_encoder_3_8bit,
+            torch_dtype=torch.float16,
+        ).to("cuda")
+
+        # Check if inference works.
+        _ = pipeline_8bit("table", max_sequence_length=20, num_inference_steps=2)
+
+        del pipeline_8bit
+

 @require_transformers_version_greater("4.44.0")
 class SlowBnb8bitFluxTests(Base8bitTests):
