From 5ea3bbcc8406c8af23dd91973cd41b54198cc8ca Mon Sep 17 00:00:00 2001 From: hh-space-invader Date: Wed, 5 Mar 2025 03:08:47 +0200 Subject: [PATCH] docs: Add description for changes --- fastembed/common/onnx_model.py | 3 +++ fastembed/image/onnx_image_model.py | 3 +++ fastembed/late_interaction/colbert.py | 1 - .../late_interaction_multimodal/onnx_multimodal_model.py | 6 ++++++ fastembed/rerank/cross_encoder/onnx_text_model.py | 3 +++ fastembed/text/onnx_text_model.py | 3 +++ 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py index 4afa2c34..020f16a4 100644 --- a/fastembed/common/onnx_model.py +++ b/fastembed/common/onnx_model.py @@ -68,6 +68,9 @@ def _load_onnx_model( if device_id is None: onnx_providers = ["CUDAExecutionProvider"] else: + # kSameAsRequested: Allocates only the requested memory, avoiding over-allocation. + # more precise than 'kNextPowerOfTwo', which grows memory aggressively. + # source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage: onnx_providers = [ ( "CUDAExecutionProvider", diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py index 30ff66aa..911f425e 100644 --- a/fastembed/image/onnx_image_model.py +++ b/fastembed/image/onnx_image_model.py @@ -82,6 +82,9 @@ def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputConte if is_cuda_enabled(cuda, providers): device_id = kwargs.get("device_id", None) device_id = str(device_id if isinstance(device_id, int) else 0) + # enables memory arena shrinkage, freeing unused memory after each Run() cycle. + # helps prevent excessive memory retention, especially for dynamic workloads. 
+ # source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage: run_options.add_run_config_entry( "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}" ) diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py index 77093fd8..841d3a73 100644 --- a/fastembed/late_interaction/colbert.py +++ b/fastembed/late_interaction/colbert.py @@ -201,7 +201,6 @@ def load_onnx_model(self) -> None: current_max_length = self.tokenizer.truncation["max_length"] # ensure not to overflow after adding document-marker self.tokenizer.enable_truncation(max_length=current_max_length - 1) - print("ME VERSION") def embed( self, diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 0e516b87..770eb787 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -111,6 +111,9 @@ def onnx_embed_text( if is_cuda_enabled(cuda, providers): device_id = kwargs.get("device_id", None) device_id = str(device_id if isinstance(device_id, int) else 0) + # enables memory arena shrinkage, freeing unused memory after each Run() cycle. + # helps prevent excessive memory retention, especially for dynamic workloads. + # source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage: run_options.add_run_config_entry( "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}" ) @@ -188,6 +191,9 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu if is_cuda_enabled(cuda, providers): device_id = kwargs.get("device_id", None) device_id = str(device_id if isinstance(device_id, int) else 0) + # enables memory arena shrinkage, freeing unused memory after each Run() cycle. + # helps prevent excessive memory retention, especially for dynamic workloads.
+ # source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage: run_options.add_run_config_entry( "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}" ) diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py index e98b374a..8226e73b 100644 --- a/fastembed/rerank/cross_encoder/onnx_text_model.py +++ b/fastembed/rerank/cross_encoder/onnx_text_model.py @@ -79,6 +79,9 @@ def onnx_embed_pairs(self, pairs: list[tuple[str, str]], **kwargs: Any) -> OnnxO if is_cuda_enabled(cuda, providers): device_id = kwargs.get("device_id", None) device_id = str(device_id if isinstance(device_id, int) else 0) + # enables memory arena shrinkage, freeing unused memory after each Run() cycle. + # helps prevent excessive memory retention, especially for dynamic workloads. + # source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage: run_options.add_run_config_entry( "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}" ) diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py index 1a3cf2cd..210d0ef9 100644 --- a/fastembed/text/onnx_text_model.py +++ b/fastembed/text/onnx_text_model.py @@ -89,6 +89,9 @@ def onnx_embed( if is_cuda_enabled(cuda, providers): device_id = kwargs.get("device_id", None) device_id = str(device_id if isinstance(device_id, int) else 0) + # enables memory arena shrinkage, freeing unused memory after each Run() cycle. + # helps prevent excessive memory retention, especially for dynamic workloads. + # source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage: run_options.add_run_config_entry( "memory.enable_memory_arena_shrinkage", f"gpu:{device_id}" )