
Commit 37a66d9

new: Add sparse type hints (#460)
* new: Add sparse type hints
* fix: ndarray -> numpyarray

Co-authored-by: George Panchuk <[email protected]>
1 parent b08febb commit 37a66d9

5 files changed: +30 -29 lines changed

fastembed/sparse/bm25.py

+6-6
@@ -123,7 +123,7 @@ def __init__(
         self.avg_len = avg_len
 
         model_description = self._get_model_description(model_name)
-        self.cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = str(define_cache_dir(cache_dir))
 
         self._model_dir = self.download_model(
             model_description,
@@ -137,7 +137,7 @@ def __init__(
         self.disable_stemmer = disable_stemmer
 
         if disable_stemmer:
-            self.stopwords = set()
+            self.stopwords: set[str] = set()
             self.stemmer = None
         else:
             self.stopwords = set(self._load_stopwords(self._model_dir, self.language))
@@ -239,7 +239,7 @@ def embed(
         )
 
     def _stem(self, tokens: list[str]) -> list[str]:
-        stemmed_tokens = []
+        stemmed_tokens: list[str] = []
         for token in tokens:
             lower_token = token.lower()
 
@@ -262,7 +262,7 @@ def raw_embed(
         self,
         documents: list[str],
     ) -> list[SparseEmbedding]:
-        embeddings = []
+        embeddings: list[SparseEmbedding] = []
         for document in documents:
             document = remove_non_alphanumeric(document)
             tokens = self.tokenizer.tokenize(document)
@@ -286,8 +286,8 @@ def _term_frequency(self, tokens: list[str]) -> dict[int, float]:
         Returns:
             dict[int, float]: The token_id to term frequency mapping.
         """
-        tf_map = {}
-        counter = defaultdict(int)
+        tf_map: dict[int, float] = {}
+        counter: defaultdict[str, int] = defaultdict(int)
         for stemmed_token in tokens:
             counter[stemmed_token] += 1
 

fastembed/sparse/bm42.py

+15-15
@@ -110,7 +110,7 @@ def __init__(
         self.device_id = None
 
         self.model_description = self._get_model_description(model_name)
-        self.cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = str(define_cache_dir(cache_dir))
 
         self._model_dir = self.download_model(
             self.model_description,
@@ -119,10 +119,10 @@ def __init__(
             specific_model_path=specific_model_path,
         )
 
-        self.invert_vocab = {}
+        self.invert_vocab: dict[int, str] = {}
 
-        self.special_tokens = set()
-        self.special_tokens_ids = set()
+        self.special_tokens: set[str] = set()
+        self.special_tokens_ids: set[int] = set()
         self.punctuation = set(string.punctuation)
         self.stopwords = set(self._load_stopwords(self._model_dir))
         self.stemmer = SnowballStemmer(MODEL_TO_LANGUAGE[model_name])
@@ -147,15 +147,15 @@ def load_onnx_model(self) -> None:
         self.stopwords = set(self._load_stopwords(self._model_dir))
 
     def _filter_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]:
-        result = []
+        result: list[tuple[str, Any]] = []
         for token, value in tokens:
             if token in self.stopwords or token in self.punctuation:
                 continue
             result.append((token, value))
         return result
 
     def _stem_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]:
-        result = []
+        result: list[tuple[str, Any]] = []
         for token, value in tokens:
             processed_token = self.stemmer.stem_word(token)
             result.append((processed_token, value))
@@ -165,7 +165,7 @@ def _stem_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]:
     def _aggregate_weights(
         cls, tokens: list[tuple[str, list[int]]], weights: list[float]
     ) -> list[tuple[str, float]]:
-        result = []
+        result: list[tuple[str, float]] = []
         for token, idxs in tokens:
             sum_weight = sum(weights[idx] for idx in idxs)
             result.append((token, sum_weight))
@@ -174,9 +174,9 @@ def _aggregate_weights(
     def _reconstruct_bpe(
         self, bpe_tokens: Iterable[tuple[int, str]]
     ) -> list[tuple[str, list[int]]]:
-        result = []
-        acc = ""
-        acc_idx = []
+        result: list[tuple[str, list[int]]] = []
+        acc: str = ""
+        acc_idx: list[int] = []
 
         continuing_subword_prefix = self.tokenizer.model.continuing_subword_prefix
         continuing_subword_prefix_len = len(continuing_subword_prefix)
@@ -206,7 +206,7 @@ def _rescore_vector(self, vector: dict[str, float]) -> dict[int, float]:
        So that the scoring doesn't depend on absolute values assigned by the model, but on the relative importance.
        """
 
-        new_vector = {}
+        new_vector: dict[int, float] = {}
 
        for token, value in vector.items():
            token_id = abs(mmh3.hash(token))
@@ -241,7 +241,7 @@ def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[SparseEmbedding]:
 
         weighted = self._aggregate_weights(stemmed, attention_value)
 
-        max_token_weight = {}
+        max_token_weight: dict[str, float] = {}
 
         for token, weight in weighted:
             max_token_weight[token] = max(max_token_weight.get(token, 0), weight)
@@ -304,7 +304,7 @@ def embed(
 
     @classmethod
     def _query_rehash(cls, tokens: Iterable[str]) -> dict[int, float]:
-        result = {}
+        result: dict[int, float] = {}
         for token in tokens:
             token_id = abs(mmh3.hash(token))
             result[token_id] = 1.0
@@ -334,11 +334,11 @@ def query_embed(
             yield SparseEmbedding.from_dict(self._query_rehash(token for token, _ in stemmed))
 
     @classmethod
-    def _get_worker_class(cls) -> Type[TextEmbeddingWorker]:
+    def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
         return Bm42TextEmbeddingWorker
 
 
-class Bm42TextEmbeddingWorker(TextEmbeddingWorker):
+class Bm42TextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
     def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Bm42:
         return Bm42(
             model_name=model_name,
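The last hunk parameterizes the worker: TextEmbeddingWorker is generic over the embedding type it produces, so subscripting it with SparseEmbedding lets the type checker connect _get_worker_class and the worker subclass to the SparseEmbedding objects Bm42 yields. A rough sketch of the generics pattern follows; the names EmbeddingWorker, SparseWorker, process, and the dict-based payload are invented for illustration and are not fastembed APIs.

from collections.abc import Iterable
from typing import Generic, TypeVar

T = TypeVar("T")

class EmbeddingWorker(Generic[T]):
    # Toy stand-in for a generic worker base class such as TextEmbeddingWorker[T].
    def process(self, documents: Iterable[str]) -> list[T]:
        raise NotImplementedError

class SparseWorker(EmbeddingWorker[dict[int, float]]):
    # Mirrors the shape of Bm42TextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
    # the subscript pins down, for mypy, what process() returns.
    def process(self, documents: Iterable[str]) -> list[dict[int, float]]:
        return [{abs(hash(doc)) % 2**31: 1.0} for doc in documents]

print(SparseWorker().process(["hello sparse world"]))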

fastembed/sparse/sparse_embedding_base.py

+5-4
@@ -3,15 +3,16 @@
 
 import numpy as np
 
+from fastembed.common.types import NumpyArray
 from fastembed.common.model_management import ModelManagement
 
 
 @dataclass
 class SparseEmbedding:
-    values: np.ndarray
-    indices: np.ndarray
+    values: NumpyArray
+    indices: NumpyArray
 
-    def as_object(self) -> dict[str, np.ndarray]:
+    def as_object(self) -> dict[str, NumpyArray]:
         return {
             "values": self.values,
             "indices": self.indices,
@@ -81,5 +82,5 @@ def query_embed(
         # This is model-specific, so that different models can have specialized implementations
         if isinstance(query, str):
             yield from self.embed([query], **kwargs)
-        if isinstance(query, Iterable):
+        else:
             yield from self.embed(query, **kwargs)
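Two things change in this file: the SparseEmbedding dataclass is annotated with the NumpyArray alias from fastembed.common.types instead of bare np.ndarray, and the second branch of query_embed becomes a plain else, since a str is itself an Iterable and would otherwise satisfy both isinstance checks. A small usage sketch; the dtypes and sample values below are arbitrary, chosen only for illustration.

import numpy as np
from collections.abc import Iterable
from fastembed.sparse.sparse_embedding_base import SparseEmbedding

# Construct a sparse embedding directly from numpy arrays; NumpyArray only changes
# the annotation, so regular np.ndarray values are still what gets stored.
emb = SparseEmbedding(
    values=np.array([0.42, 1.7], dtype=np.float32),
    indices=np.array([17, 1029], dtype=np.int64),
)
print(emb.as_object())  # {'values': array([0.42, 1.7 ], ...), 'indices': array([  17, 1029])}

# Why the else-branch fix matters: a string is itself an Iterable, so the old
# isinstance(query, Iterable) check was also true for str inputs.
print(isinstance("sparse query", Iterable))  # True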

fastembed/sparse/sparse_text_embedding.py

+1-1
@@ -38,7 +38,7 @@ def list_supported_models(cls) -> list[dict[str, Any]]:
             ]
             ```
         """
-        result = []
+        result: list[dict[str, Any]] = []
         for embedding in cls.EMBEDDINGS_REGISTRY:
             result.extend(embedding.list_supported_models())
         return result

fastembed/sparse/splade_pp.py

+3-3
@@ -114,7 +114,7 @@ def __init__(
         self.device_id = None
 
         self.model_description = self._get_model_description(model_name)
-        self.cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = str(define_cache_dir(cache_dir))
 
         self._model_dir = self.download_model(
             self.model_description,
@@ -171,11 +171,11 @@ def embed(
         )
 
     @classmethod
-    def _get_worker_class(cls) -> Type[TextEmbeddingWorker]:
+    def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
         return SpladePPEmbeddingWorker
 
 
-class SpladePPEmbeddingWorker(TextEmbeddingWorker):
+class SpladePPEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
     def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> SpladePP:
         return SpladePP(
             model_name=model_name,

0 commit comments
