fix: Fix multi-GPU settings

hh-space-invader · hh-space-invader · commit 1c016a2a3ffe · 2025-03-05T02:45:37.000+02:00
diff --git a/fastembed/common/utils.py b/fastembed/common/utils.py
@@ -5,12 +5,12 @@
 import unicodedata
 from pathlib import Path
 from itertools import islice
-from typing import Iterable, Optional, TypeVar
+from typing import Iterable, Optional, TypeVar, Sequence
 
 import numpy as np
 from numpy.typing import NDArray
 
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, OnnxProvider
 
 T = TypeVar("T")
 
@@ -67,3 +67,18 @@ def get_all_punctuation() -> set[str]:
 
 def remove_non_alphanumeric(text: str) -> str:
     return re.sub(r"[^\w\s]", " ", text, flags=re.UNICODE)
+
+
+def is_cuda_enabled(cuda: bool, providers: Optional[Sequence[OnnxProvider]]) -> bool:
+    """
+    Check if CUDA is enabled: `cuda` is truthy, or `providers` names "CUDAExecutionProvider"
+    as a plain string entry or as the first item of a `(name, options)` tuple entry
+    """
+    if cuda:
+        return True
+    if not providers or not isinstance(providers, (str, list, tuple)):
+        return False
+    if isinstance(providers, str):
+        return "CUDAExecutionProvider" in providers
+    names = (p[0] if isinstance(p, tuple) and p else p for p in providers)
+    return any(isinstance(n, str) and "CUDAExecutionProvider" in n for n in names)
diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py
@@ -13,7 +13,7 @@
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_preprocessor
-from fastembed.common.utils import iter_batch
+from fastembed.common.utils import iter_batch, is_cuda_enabled
 from fastembed.parallel_processor import ParallelWorkerPool
 
 # Holds type of the embedding result
@@ -66,7 +66,6 @@ def _build_onnx_input(self, encoded: NumpyArray) -> dict[str, NumpyArray]:
         return {input_name: encoded}
 
     def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext:
-        device_id = kwargs.pop("device_id", 0)
         with contextlib.ExitStack():
             image_files = [
                 Image.open(image) if not isinstance(image, Image.Image) else image
@@ -76,12 +75,18 @@ def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputConte
             encoded = np.array(self.processor(image_files))
         onnx_input = self._build_onnx_input(encoded)
         onnx_input = self._preprocess_onnx_input(onnx_input)
-        device_id = device_id if isinstance(device_id, int) else 0
+
         run_options = ort.RunOptions()
-        run_options.add_run_config_entry(
-            "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
-        )
-        model_output = self.model.run(None, onnx_input)  # type: ignore[union-attr]
+        providers = kwargs.get("providers", None)
+        cuda = kwargs.get("cuda", False)
+        if is_cuda_enabled(cuda, providers):
+            device_id = kwargs.get("device_id", None)
+            device_id = str(device_id if isinstance(device_id, int) else 0)
+            run_options.add_run_config_entry(
+                "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
+            )
+
+        model_output = self.model.run(None, onnx_input, run_options)  # type: ignore[union-attr]
         embeddings = model_output[0].reshape(len(images), -1)
         return OnnxOutputContext(model_output=embeddings)
 
diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -13,7 +13,7 @@
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor
 from fastembed.common.types import NumpyArray
-from fastembed.common.utils import iter_batch
+from fastembed.common.utils import iter_batch, is_cuda_enabled
 from fastembed.image.transform.operators import Compose
 from fastembed.parallel_processor import ParallelWorkerPool
 
@@ -89,7 +89,6 @@ def onnx_embed_text(
         documents: list[str],
         **kwargs: Any,
     ) -> OnnxOutputContext:
-        device_id = kwargs.pop("device_id", 0)
         encoded = self.tokenize(documents, **kwargs)
         input_ids = np.array([e.ids for e in encoded])
         attention_mask = np.array([e.attention_mask for e in encoded])  # type: ignore[union-attr]
@@ -105,12 +104,18 @@ def onnx_embed_text(
             )
 
         onnx_input = self._preprocess_onnx_text_input(onnx_input, **kwargs)
-        device_id = device_id if isinstance(device_id, int) else 0
+
         run_options = ort.RunOptions()
-        run_options.add_run_config_entry(
-            "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
-        )
-        model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)  # type: ignore[union-attr]
+        providers = kwargs.get("providers", None)
+        cuda = kwargs.get("cuda", False)
+        if is_cuda_enabled(cuda, providers):
+            device_id = kwargs.get("device_id", None)
+            device_id = str(device_id if isinstance(device_id, int) else 0)
+            run_options.add_run_config_entry(
+                "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
+            )
+
+        model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input, run_options)  # type: ignore[union-attr]
         return OnnxOutputContext(
             model_output=model_output[0],
             attention_mask=onnx_input.get("attention_mask", attention_mask),
@@ -167,7 +172,6 @@ def _embed_documents(
                 yield from self._post_process_onnx_text_output(batch)  # type: ignore
 
     def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext:
-        device_id = kwargs.pop("device_id", 0)
         with contextlib.ExitStack():
             image_files = [
                 Image.open(image) if not isinstance(image, Image.Image) else image
@@ -177,12 +181,18 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu
             encoded = np.array(self.processor(image_files))
         onnx_input = {"pixel_values": encoded}
         onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs)
-        device_id = device_id if isinstance(device_id, int) else 0
+
         run_options = ort.RunOptions()
-        run_options.add_run_config_entry(
-            "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
-        )
-        model_output = self.model.run(None, onnx_input)  # type: ignore[union-attr]
+        providers = kwargs.get("providers", None)
+        cuda = kwargs.get("cuda", False)
+        if is_cuda_enabled(cuda, providers):
+            device_id = kwargs.get("device_id", None)
+            device_id = str(device_id if isinstance(device_id, int) else 0)
+            run_options.add_run_config_entry(
+                "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
+            )
+
+        model_output = self.model.run(None, onnx_input, run_options)  # type: ignore[union-attr]
         embeddings = model_output[0].reshape(len(images), -1)
         return OnnxOutputContext(model_output=embeddings)
 
diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -15,7 +15,7 @@
 )
 from fastembed.common.types import NumpyArray
 from fastembed.common.preprocessor_utils import load_tokenizer
-from fastembed.common.utils import iter_batch
+from fastembed.common.utils import iter_batch, is_cuda_enabled
 from fastembed.parallel_processor import ParallelWorkerPool
 
 
@@ -69,15 +69,20 @@ def onnx_embed(self, query: str, documents: list[str], **kwargs: Any) -> OnnxOut
         return self.onnx_embed_pairs(pairs, **kwargs)
 
     def onnx_embed_pairs(self, pairs: list[tuple[str, str]], **kwargs: Any) -> OnnxOutputContext:
-        device_id = kwargs.pop("device_id", 0)
         tokenized_input = self.tokenize(pairs, **kwargs)
         inputs = self._build_onnx_input(tokenized_input)
         onnx_input = self._preprocess_onnx_input(inputs, **kwargs)
-        device_id = device_id if isinstance(device_id, int) else 0
+
         run_options = ort.RunOptions()
-        run_options.add_run_config_entry(
-            "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
-        )
+        providers = kwargs.get("providers", None)
+        cuda = kwargs.get("cuda", False)
+        if is_cuda_enabled(cuda, providers):
+            device_id = kwargs.get("device_id", None)
+            device_id = str(device_id if isinstance(device_id, int) else 0)
+            run_options.add_run_config_entry(
+                "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
+            )
+
         outputs = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input, run_options)  # type: ignore[union-attr]
         relevant_output = outputs[0]
         scores: NumpyArray = relevant_output[:, 0]
diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py
@@ -11,7 +11,7 @@
 from fastembed.common.types import NumpyArray, OnnxProvider
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_tokenizer
-from fastembed.common.utils import iter_batch
+from fastembed.common.utils import iter_batch, is_cuda_enabled
 from fastembed.parallel_processor import ParallelWorkerPool
 
 
@@ -68,7 +68,6 @@ def onnx_embed(
         documents: list[str],
         **kwargs: Any,
     ) -> OnnxOutputContext:
-        device_id = kwargs.pop("device_id", 0)
         encoded = self.tokenize(documents, **kwargs)
         input_ids = np.array([e.ids for e in encoded])
         attention_mask = np.array([e.attention_mask for e in encoded])
@@ -84,11 +83,16 @@ def onnx_embed(
             )
         onnx_input = self._preprocess_onnx_input(onnx_input, **kwargs)
 
-        device_id = device_id if isinstance(device_id, int) else 0
         run_options = ort.RunOptions()
-        run_options.add_run_config_entry(
-            "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
-        )
+        providers = kwargs.get("providers", None)
+        cuda = kwargs.get("cuda", False)
+        if is_cuda_enabled(cuda, providers):
+            device_id = kwargs.get("device_id", None)
+            device_id = str(device_id if isinstance(device_id, int) else 0)
+            run_options.add_run_config_entry(
+                "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
+            )
+
         model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input, run_options)  # type: ignore[union-attr]
         return OnnxOutputContext(
             model_output=model_output[0],