chore: Add missing returns in defs (#451)

hh-space-invader · joein · web-flow · commit 73e1e5ecb981 · 2025-01-29T08:33:49.000+01:00
* chore: Add missing returns in defs

* remove return type from init

* remove incorrect ndarray specifier

---------

Co-authored-by: George Panchuk <george.panchuk@qdrant.tech>
diff --git a/docs/examples/ColBERT_with_FastEmbed.ipynb b/docs/examples/ColBERT_with_FastEmbed.ipynb
@@ -54,7 +54,14 @@
     },
     {
      "data": {
-      "text/plain": "[{'model': 'colbert-ir/colbertv2.0',\n  'dim': 128,\n  'description': 'Late interaction model',\n  'size_in_GB': 0.44,\n  'sources': {'hf': 'colbert-ir/colbertv2.0'},\n  'model_file': 'model.onnx'}]"
+      "text/plain": [
+       "[{'model': 'colbert-ir/colbertv2.0',\n",
+       "  'dim': 128,\n",
+       "  'description': 'Late interaction model',\n",
+       "  'size_in_GB': 0.44,\n",
+       "  'sources': {'hf': 'colbert-ir/colbertv2.0'},\n",
+       "  'model_file': 'model.onnx'}]"
+      ]
      },
      "execution_count": 1,
      "metadata": {},
@@ -212,7 +219,9 @@
    "outputs": [
     {
      "data": {
-      "text/plain": "((26, 128), (32, 128))"
+      "text/plain": [
+       "((26, 128), (32, 128))"
+      ]
      },
      "execution_count": 18,
      "metadata": {},
@@ -271,7 +280,9 @@
     "import numpy as np\n",
     "\n",
     "\n",
-    "def compute_relevance_scores(query_embedding: np.array, document_embeddings: np.array, k: int):\n",
+    "def compute_relevance_scores(\n",
+    "    query_embedding: np.array, document_embeddings: np.array, k: int\n",
+    ") -> list[int]:\n",
     "    \"\"\"\n",
     "    Compute relevance scores for top-k documents given a query.\n",
     "\n",
diff --git a/docs/examples/FastEmbed_vs_HF_Comparison.ipynb b/docs/examples/FastEmbed_vs_HF_Comparison.ipynb
@@ -152,7 +152,7 @@
     "    HuggingFace Transformer implementation of FlagEmbedding\n",
     "    \"\"\"\n",
     "\n",
-    "    def __init__(self, model_id: str):\n",
+    "    def __init__(self, model_id: str) -> None:\n",
     "        self.model = AutoModel.from_pretrained(model_id)\n",
     "        self.tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
     "\n",
diff --git a/docs/examples/Hybrid_Search.ipynb b/docs/examples/Hybrid_Search.ipynb
@@ -488,7 +488,7 @@
     }
    ],
    "source": [
-    "def make_sparse_embedding(texts: list[str]):\n",
+    "def make_sparse_embedding(texts: list[str]) -> list[SparseEmbedding]:\n",
     "    return list(sparse_model.embed(texts, batch_size=32))\n",
     "\n",
     "\n",
@@ -615,7 +615,7 @@
     }
    ],
    "source": [
-    "def get_tokens_and_weights(sparse_embedding, model_name):\n",
+    "def get_tokens_and_weights(sparse_embedding, model_name) -> dict[str, float]:\n",
     "    # Find the tokenizer for the model\n",
     "    tokenizer_source = None\n",
     "    for model_info in SparseTextEmbedding.list_supported_models():\n",
@@ -626,7 +626,7 @@
     "            raise ValueError(f\"Model {model_name} not found in the supported models.\")\n",
     "\n",
     "    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)\n",
-    "    token_weight_dict = {}\n",
+    "    token_weight_dict: dict[str, float] = {}\n",
     "    for i in range(len(sparse_embedding.indices)):\n",
     "        token = tokenizer.decode([sparse_embedding.indices[i]])\n",
     "        weight = sparse_embedding.values[i]\n",
diff --git a/fastembed/common/model_management.py b/fastembed/common/model_management.py
@@ -255,7 +255,7 @@ def _save_file_metadata(model_dir: Path, meta: dict[str, dict[str, int]]) -> Non
         return result
 
     @classmethod
-    def decompress_to_cache(cls, targz_path: str, cache_dir: str):
+    def decompress_to_cache(cls, targz_path: str, cache_dir: str) -> str:
         """
         Decompresses a .tar.gz file to a cache directory.
 
diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py
@@ -24,7 +24,7 @@ def _get_worker_class(cls) -> Type["ImageEmbeddingWorker"]:
     def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[T]:
         raise NotImplementedError("Subclasses must implement this method")
 
-    def __init__(self) -> None:
+    def __init__(self):
         super().__init__()
         self.processor = None
 
diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py
@@ -118,7 +118,7 @@ def rescale(image: np.ndarray, scale: float, dtype=np.float32) -> np.ndarray:
     return (image * scale).astype(dtype)
 
 
-def pil2ndarray(image: Union[Image.Image, np.ndarray]):
+def pil2ndarray(image: Union[Image.Image, np.ndarray]) -> np.ndarray:
     if isinstance(image, Image.Image):
         return np.asarray(image).transpose((2, 0, 1))
     return image
diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py
@@ -133,11 +133,11 @@ def from_config(cls, config: dict[str, Any]) -> "Compose":
         return cls(transforms=transforms)
 
     @staticmethod
-    def _get_convert_to_rgb(transforms: list[Transform], config: dict[str, Any]):
+    def _get_convert_to_rgb(transforms: list[Transform], config: dict[str, Any]) -> None:
         transforms.append(ConvertToRGB())
 
     @classmethod
-    def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]):
+    def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]) -> None:
         mode = config.get("image_processor_type", "CLIPImageProcessor")
         if mode == "CLIPImageProcessor":
             if config.get("do_resize", False):
@@ -200,7 +200,7 @@ def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]):
             raise ValueError(f"Preprocessor {mode} is not supported")
 
     @staticmethod
-    def _get_center_crop(transforms: list[Transform], config: dict[str, Any]):
+    def _get_center_crop(transforms: list[Transform], config: dict[str, Any]) -> None:
         mode = config.get("image_processor_type", "CLIPImageProcessor")
         if mode == "CLIPImageProcessor":
             if config.get("do_center_crop", False):
@@ -220,24 +220,24 @@ def _get_center_crop(transforms: list[Transform], config: dict[str, Any]):
             raise ValueError(f"Preprocessor {mode} is not supported")
 
     @staticmethod
-    def _get_pil2ndarray(transforms: list[Transform], config: dict[str, Any]):
+    def _get_pil2ndarray(transforms: list[Transform], config: dict[str, Any]) -> None:
         transforms.append(PILtoNDarray())
 
     @staticmethod
-    def _get_rescale(transforms: list[Transform], config: dict[str, Any]):
+    def _get_rescale(transforms: list[Transform], config: dict[str, Any]) -> None:
         if config.get("do_rescale", True):
             rescale_factor = config.get("rescale_factor", 1 / 255)
             transforms.append(Rescale(scale=rescale_factor))
 
     @staticmethod
-    def _get_normalize(transforms: list[Transform], config: dict[str, Any]):
+    def _get_normalize(transforms: list[Transform], config: dict[str, Any]) -> None:
         if config.get("do_normalize", False):
             transforms.append(Normalize(mean=config["image_mean"], std=config["image_std"]))
         elif "mean" in config and "std" in config:
             transforms.append(Normalize(mean=config["mean"], std=config["std"]))
 
     @staticmethod
-    def _get_pad2square(transforms: list[Transform], config: dict[str, Any]):
+    def _get_pad2square(transforms: list[Transform], config: dict[str, Any]) -> None:
         mode = config.get("image_processor_type", "CLIPImageProcessor")
         if mode == "CLIPImageProcessor":
             pass
diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -1,9 +1,10 @@
 import os
 from multiprocessing import get_all_start_methods
 from pathlib import Path
-from typing import Any, Iterable, Optional, Sequence, Type
+from typing import Any, Iterable, Optional, Sequence, Type, Union
 
 import numpy as np
+from numpy.typing import NDArray
 from tokenizers import Encoding
 
 from fastembed.common.onnx_model import (
@@ -46,7 +47,9 @@ def _load_onnx_model(
     def tokenize(self, pairs: list[tuple[str, str]], **_: Any) -> list[Encoding]:
         return self.tokenizer.encode_batch(pairs)
 
-    def _build_onnx_input(self, tokenized_input):
+    def _build_onnx_input(
+        self, tokenized_input
+    ) -> dict[str, NDArray[Union[np.float32, np.int64]]]:
         input_names = {node.name for node in self.model.get_inputs()}
         inputs = {
             "input_ids": np.array([enc.ids for enc in tokenized_input], dtype=np.int64),
diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py
@@ -23,7 +23,7 @@ def _get_worker_class(cls) -> Type["TextEmbeddingWorker"]:
     def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[T]:
         raise NotImplementedError("Subclasses must implement this method")
 
-    def __init__(self) -> None:
+    def __init__(self):
         super().__init__()
         self.tokenizer = None
         self.special_token_to_id = {}
diff --git a/fastembed/text/text_embedding_base.py b/fastembed/text/text_embedding_base.py
@@ -43,6 +43,7 @@ def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[np.ndar
         yield from self.embed(texts, **kwargs)
 
     def query_embed(self, query: Union[str, Iterable[str]], **kwargs: Any) -> Iterable[np.ndarray]:
+
         """
         Embeds queries
 
diff --git a/tests/test_attention_embeddings.py b/tests/test_attention_embeddings.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
-def test_attention_embeddings(model_name):
+def test_attention_embeddings(model_name) -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name=model_name)
 
@@ -71,7 +71,7 @@ def test_attention_embeddings(model_name):
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
-def test_parallel_processing(model_name):
+def test_parallel_processing(model_name) -> None:
     is_ci = os.getenv("CI")
 
     model = SparseTextEmbedding(model_name=model_name)
@@ -96,7 +96,7 @@ def test_parallel_processing(model_name):
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
-def test_multilanguage(model_name):
+def test_multilanguage(model_name) -> None:
     is_ci = os.getenv("CI")
 
     docs = ["Mangez-vous vraiment des grenouilles?", "Je suis au lit"]
@@ -122,7 +122,7 @@ def test_multilanguage(model_name):
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
-def test_special_characters(model_name):
+def test_special_characters(model_name) -> None:
     is_ci = os.getenv("CI")
 
     docs = [
@@ -145,7 +145,7 @@ def test_special_characters(model_name):
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions"])
-def test_lazy_load(model_name):
+def test_lazy_load(model_name) -> None:
     model = SparseTextEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
     docs = ["hello world", "flag embedding"]
diff --git a/tests/test_image_onnx_embeddings.py b/tests/test_image_onnx_embeddings.py
@@ -27,7 +27,7 @@
 }
 
 
-def test_embedding():
+def test_embedding() -> None:
     is_ci = os.getenv("CI")
 
     for model_desc in ImageEmbedding.list_supported_models():
@@ -61,7 +61,7 @@ def test_embedding():
 
 
 @pytest.mark.parametrize("n_dims,model_name", [(512, "Qdrant/clip-ViT-B-32-vision")])
-def test_batch_embedding(n_dims, model_name):
+def test_batch_embedding(n_dims, model_name) -> None:
     is_ci = os.getenv("CI")
     model = ImageEmbedding(model_name=model_name)
     n_images = 32
@@ -81,7 +81,7 @@ def test_batch_embedding(n_dims, model_name):
 
 
 @pytest.mark.parametrize("n_dims,model_name", [(512, "Qdrant/clip-ViT-B-32-vision")])
-def test_parallel_processing(n_dims, model_name):
+def test_parallel_processing(n_dims, model_name) -> None:
     is_ci = os.getenv("CI")
     model = ImageEmbedding(model_name=model_name)
 
@@ -109,7 +109,7 @@ def test_parallel_processing(n_dims, model_name):
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"])
-def test_lazy_load(model_name):
+def test_lazy_load(model_name) -> None:
     is_ci = os.getenv("CI")
     model = ImageEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
diff --git a/tests/test_multi_gpu.py b/tests/test_multi_gpu.py
@@ -13,7 +13,7 @@
 
 @pytest.mark.skip(reason="Requires a multi-gpu server")
 @pytest.mark.parametrize("device_id", [None, 0, 1])
-def test_gpu_via_providers(device_id):
+def test_gpu_via_providers(device_id) -> None:
     docs = ["hello world", "flag embedding"]
 
     device_id = device_id if device_id is not None else 0
@@ -85,7 +85,7 @@ def test_gpu_via_providers(device_id):
 
 @pytest.mark.skip(reason="Requires a multi-gpu server")
 @pytest.mark.parametrize("device_ids", [None, [0], [1], [0, 1]])
-def test_gpu_cuda_device_ids(device_ids):
+def test_gpu_cuda_device_ids(device_ids) -> None:
     docs = ["hello world", "flag embedding"]
     device_id = device_ids[0] if device_ids else 0
     embedding_model = TextEmbedding(
@@ -170,7 +170,7 @@ def test_gpu_cuda_device_ids(device_ids):
 @pytest.mark.parametrize(
     "device_ids,parallel", [(None, None), (None, 2), ([1], None), ([1], 1), ([1], 2), ([0, 1], 2)]
 )
-def test_multi_gpu_parallel_inference(device_ids, parallel):
+def test_multi_gpu_parallel_inference(device_ids, parallel) -> None:
     docs = ["hello world", "flag embedding"] * 100
     batch_size = 5
 
diff --git a/tests/test_sparse_embeddings.py b/tests/test_sparse_embeddings.py
@@ -49,7 +49,7 @@
 docs = ["Hello World"]
 
 
-def test_batch_embedding():
+def test_batch_embedding() -> None:
     is_ci = os.getenv("CI")
     docs_to_embed = docs * 10
 
@@ -64,7 +64,7 @@ def test_batch_embedding():
             delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding():
+def test_single_embedding() -> None:
     is_ci = os.getenv("CI")
     for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
         model = SparseTextEmbedding(model_name=model_name)
@@ -80,7 +80,7 @@ def test_single_embedding():
             delete_model_cache(model.model._model_dir)
 
 
-def test_parallel_processing():
+def test_parallel_processing() -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
     docs = ["hello world", "flag embedding"] * 30
@@ -111,15 +111,15 @@ def test_parallel_processing():
 
 
 @pytest.fixture
-def bm25_instance():
+def bm25_instance() -> None:
     ci = os.getenv("CI", True)
     model = Bm25("Qdrant/bm25", language="english")
     yield model
     if ci:
         delete_model_cache(model._model_dir)
 
 
-def test_stem_with_stopwords_and_punctuation(bm25_instance):
+def test_stem_with_stopwords_and_punctuation(bm25_instance) -> None:
     # Setup
     bm25_instance.stopwords = {"the", "is", "a"}
     bm25_instance.punctuation = {".", ",", "!"}
@@ -135,7 +135,7 @@ def test_stem_with_stopwords_and_punctuation(bm25_instance):
     assert result == expected, f"Expected {expected}, but got {result}"
 
 
-def test_stem_case_insensitive_stopwords(bm25_instance):
+def test_stem_case_insensitive_stopwords(bm25_instance) -> None:
     # Setup
     bm25_instance.stopwords = {"the", "is", "a"}
     bm25_instance.punctuation = {".", ",", "!"}
@@ -152,7 +152,7 @@ def test_stem_case_insensitive_stopwords(bm25_instance):
 
 
 @pytest.mark.parametrize("disable_stemmer", [True, False])
-def test_disable_stemmer_behavior(disable_stemmer):
+def test_disable_stemmer_behavior(disable_stemmer) -> None:
     # Setup
     model = Bm25("Qdrant/bm25", language="english", disable_stemmer=disable_stemmer)
     model.stopwords = {"the", "is", "a"}
@@ -176,7 +176,7 @@ def test_disable_stemmer_behavior(disable_stemmer):
     "model_name",
     ["prithivida/Splade_PP_en_v1"],
 )
-def test_lazy_load(model_name):
+def test_lazy_load(model_name) -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
diff --git a/tests/test_text_cross_encoder.py b/tests/test_text_cross_encoder.py
diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py
diff --git a/tests/utils.py b/tests/utils.py