fix: store resolved cache dir on self to avoid None cache dir in parallel mode (#277)

joein · web-flow · commit e1ecfe9c2f68 · 2024-06-14T17:19:53.000+02:00
diff --git a/fastembed/image/onnx_embedding.py b/fastembed/image/onnx_embedding.py
@@ -56,9 +56,9 @@ def __init__(
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
         model_description = self._get_model_description(model_name)
-        cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = define_cache_dir(cache_dir)
         model_dir = self.download_model(
-            model_description, cache_dir, local_files_only=self._local_files_only
+            model_description, self.cache_dir, local_files_only=self._local_files_only
         )
 
         self.load_onnx_model(
@@ -122,16 +122,10 @@ def _preprocess_onnx_input(
 
         return onnx_input
 
-    def _post_process_onnx_output(
-        self, output: OnnxOutputContext
-    ) -> Iterable[np.ndarray]:
+    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[np.ndarray]:
         return normalize(output.model_output).astype(np.float32)
 
 
 class OnnxImageEmbeddingWorker(ImageEmbeddingWorker):
-    def init_embedding(
-        self, model_name: str, cache_dir: str, **kwargs
-    ) -> OnnxImageEmbedding:
-        return OnnxImageEmbedding(
-            model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs
-        )
+    def init_embedding(self, model_name: str, cache_dir: str, **kwargs) -> OnnxImageEmbedding:
+        return OnnxImageEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs)
diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py
@@ -43,9 +43,7 @@ def _post_process_onnx_output(
                 if token_id in self.skip_list or token_id == self.pad_token_id:
                     output.attention_mask[i, j] = 0
 
-        output.model_output *= np.expand_dims(output.attention_mask, 2).astype(
-            np.float32
-        )
+        output.model_output *= np.expand_dims(output.attention_mask, 2).astype(np.float32)
         norm = np.linalg.norm(output.model_output, ord=2, axis=2, keepdims=True)
         norm_clamped = np.maximum(norm, 1e-12)
         output.model_output /= norm_clamped
@@ -126,10 +124,10 @@ def __init__(
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
         model_description = self._get_model_description(model_name)
-        cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = define_cache_dir(cache_dir)
 
         model_dir = self.download_model(
-            model_description, cache_dir, local_files_only=self._local_files_only
+            model_description, self.cache_dir, local_files_only=self._local_files_only
         )
 
         self.load_onnx_model(
diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
@@ -79,10 +79,10 @@ def __init__(
         self.avg_len = avg_len
 
         model_description = self._get_model_description(model_name)
-        cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = define_cache_dir(cache_dir)
 
         model_dir = self.download_model(
-            model_description, cache_dir, local_files_only=self._local_files_only
+            model_description, self.cache_dir, local_files_only=self._local_files_only
         )
 
         self.punctuation = set(string.punctuation)
@@ -133,9 +133,7 @@ def _embed_documents(
             for batch in iter_batch(documents, batch_size):
                 yield from self.raw_embed(batch)
         else:
-            start_method = (
-                "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
-            )
+            start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
             params = {
                 "model_name": model_name,
                 "cache_dir": cache_dir,
@@ -241,9 +239,7 @@ def _term_frequency(self, tokens: List[str]) -> Dict[int, float]:
     def compute_token_id(cls, token: str) -> int:
         return abs(mmh3.hash(token))
 
-    def query_embed(
-        self, query: Union[str, Iterable[str]], **kwargs
-    ) -> Iterable[SparseEmbedding]:
+    def query_embed(self, query: Union[str, Iterable[str]], **kwargs) -> Iterable[SparseEmbedding]:
         """To emulate BM25 behaviour, we don't need to use weights in the query, and
         it's enough to just hash the tokens and assign a weight of 1.0 to them.
         """
diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py
@@ -81,10 +81,10 @@ def __init__(
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
         model_description = self._get_model_description(model_name)
-        cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = define_cache_dir(cache_dir)
 
         model_dir = self.download_model(
-            model_description, cache_dir, local_files_only=self._local_files_only
+            model_description, self.cache_dir, local_files_only=self._local_files_only
         )
 
         self.load_onnx_model(
@@ -106,9 +106,7 @@ def __init__(
         self.stemmer = get_stemmer(MODEL_TO_LANGUAGE[model_name])
         self.alpha = alpha
 
-    def _filter_pair_tokens(
-        self, tokens: List[Tuple[str, Any]]
-    ) -> List[Tuple[str, Any]]:
+    def _filter_pair_tokens(self, tokens: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
         result = []
         for token, value in tokens:
             if token in self.stopwords or token in self.punctuation:
@@ -180,19 +178,13 @@ def _rescore_vector(self, vector: Dict[str, float]) -> Dict[int, float]:
 
         return new_vector
 
-    def _post_process_onnx_output(
-        self, output: OnnxOutputContext
-    ) -> Iterable[SparseEmbedding]:
+    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[SparseEmbedding]:
         token_ids_batch = output.input_ids
 
         # attention_value shape: (batch_size, num_heads, num_tokens, num_tokens)
-        pooled_attention = (
-            np.mean(output.model_output[:, :, 0], axis=1) * output.attention_mask
-        )
+        pooled_attention = np.mean(output.model_output[:, :, 0], axis=1) * output.attention_mask
 
-        for document_token_ids, attention_value in zip(
-            token_ids_batch, pooled_attention
-        ):
+        for document_token_ids, attention_value in zip(token_ids_batch, pooled_attention):
             document_tokens_with_ids = (
                 (idx, self.invert_vocab[token_id])
                 for idx, token_id in enumerate(document_token_ids)
@@ -272,9 +264,7 @@ def _query_rehash(cls, tokens: Iterable[str]) -> Dict[int, float]:
             result[token_id] = 1.0
         return result
 
-    def query_embed(
-        self, query: Union[str, Iterable[str]], **kwargs
-    ) -> Iterable[SparseEmbedding]:
+    def query_embed(self, query: Union[str, Iterable[str]], **kwargs) -> Iterable[SparseEmbedding]:
         """
         To emulate BM25 behaviour, we don't need to use smart weights in the query, and
         it's enough to just hash the tokens and assign a weight of 1.0 to them.
@@ -290,9 +280,7 @@ def query_embed(
             filtered = self._filter_pair_tokens(reconstructed)
             stemmed = self._stem_pair_tokens(filtered)
 
-            yield SparseEmbedding.from_dict(
-                self._query_rehash(token for token, _ in stemmed)
-            )
+            yield SparseEmbedding.from_dict(self._query_rehash(token for token, _ in stemmed))
 
     @classmethod
     def _get_worker_class(cls) -> Type[TextEmbeddingWorker]:
diff --git a/fastembed/sparse/splade_pp.py b/fastembed/sparse/splade_pp.py
@@ -36,9 +36,7 @@
 
 
 class SpladePP(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]):
-    def _post_process_onnx_output(
-        self, output: OnnxOutputContext
-    ) -> Iterable[SparseEmbedding]:
+    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[SparseEmbedding]:
         relu_log = np.log(1 + np.maximum(output.model_output, 0))
 
         weighted_log = relu_log * np.expand_dims(output.attention_mask, axis=-1)
@@ -84,10 +82,10 @@ def __init__(
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
         model_description = self._get_model_description(model_name)
-        cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = define_cache_dir(cache_dir)
 
         model_dir = self.download_model(
-            model_description, cache_dir, local_files_only=self._local_files_only
+            model_description, self.cache_dir, local_files_only=self._local_files_only
         )
 
         self.load_onnx_model(
diff --git a/fastembed/text/mini_lm_embedding.py b/fastembed/text/mini_lm_embedding.py
@@ -28,12 +28,10 @@ def _get_worker_class(cls) -> Type[TextEmbeddingWorker]:
         return MiniLMEmbeddingWorker
 
     @classmethod
-    def mean_pooling(self, model_output: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
+    def mean_pooling(cls, model_output: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
         token_embeddings = model_output
         input_mask_expanded = np.expand_dims(attention_mask, axis=-1)
-        input_mask_expanded = np.tile(
-            input_mask_expanded, (1, 1, token_embeddings.shape[-1])
-        )
+        input_mask_expanded = np.tile(input_mask_expanded, (1, 1, token_embeddings.shape[-1]))
         input_mask_expanded = input_mask_expanded.astype(float)
         sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)
         sum_mask = np.sum(input_mask_expanded, axis=1)
@@ -49,21 +47,12 @@ def list_supported_models(cls) -> List[Dict[str, Any]]:
         """
         return supported_mini_lm_models
 
-    def _post_process_onnx_output(
-        self, output: OnnxOutputContext
-    ) -> Iterable[np.ndarray]:
+    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[np.ndarray]:
         embeddings = output.model_output
         attn_mask = output.attention_mask
         return normalize(self.mean_pooling(embeddings, attn_mask)).astype(np.float32)
 
 
 class MiniLMEmbeddingWorker(OnnxTextEmbeddingWorker):
-    def init_embedding(
-        self,
-        model_name: str,
-        cache_dir: str,
-        **kwargs
-    ) -> OnnxTextEmbedding:
-        return MiniLMOnnxEmbedding(
-            model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs
-        )
+    def init_embedding(self, model_name: str, cache_dir: str, **kwargs) -> OnnxTextEmbedding:
+        return MiniLMOnnxEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs)
diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py
@@ -219,9 +219,9 @@ def __init__(
         super().__init__(model_name, cache_dir, threads, **kwargs)
 
         model_description = self._get_model_description(model_name)
-        cache_dir = define_cache_dir(cache_dir)
+        self.cache_dir = define_cache_dir(cache_dir)
         model_dir = self.download_model(
-            model_description, cache_dir, local_files_only=self._local_files_only
+            model_description, self.cache_dir, local_files_only=self._local_files_only
         )
 
         self.load_onnx_model(
@@ -274,9 +274,7 @@ def _preprocess_onnx_input(
         """
         return onnx_input
 
-    def _post_process_onnx_output(
-        self, output: OnnxOutputContext
-    ) -> Iterable[np.ndarray]:
+    def _post_process_onnx_output(self, output: OnnxOutputContext) -> Iterable[np.ndarray]:
         embeddings = output.model_output
         return normalize(embeddings[:, 0]).astype(np.float32)
 
@@ -288,6 +286,4 @@ def init_embedding(
         cache_dir: str,
         **kwargs,
     ) -> OnnxTextEmbedding:
-        return OnnxTextEmbedding(
-            model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs
-        )
+        return OnnxTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs)
diff --git a/tests/test_late_interaction_embeddings.py b/tests/test_late_interaction_embeddings.py
@@ -77,9 +77,7 @@ def test_single_embedding():
 
     for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
         print("evaluating", model_name)
-        model = LateInteractionTextEmbedding(
-            model_name=model_name, cache_dir="colbert-cache"
-        )
+        model = LateInteractionTextEmbedding(model_name=model_name)
         result = next(iter(model.embed(docs_to_embed, batch_size=6)))
         token_num, abridged_dim = expected_result.shape
         assert np.allclose(result[:, :abridged_dim], expected_result, atol=10e-4)
@@ -90,18 +88,14 @@ def test_single_embedding_query():
 
     for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
         print("evaluating", model_name)
-        model = LateInteractionTextEmbedding(
-            model_name=model_name, cache_dir="colbert-cache"
-        )
+        model = LateInteractionTextEmbedding(model_name=model_name)
         result = next(iter(model.query_embed(queries_to_embed)))
         token_num, abridged_dim = expected_result.shape
         assert np.allclose(result[:, :abridged_dim], expected_result, atol=10e-4)
 
 
 def test_parallel_processing():
-    model = LateInteractionTextEmbedding(
-        model_name="colbert-ir/colbertv2.0", cache_dir="colbert-cache"
-    )
+    model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0")
     token_dim = 128
     docs = ["hello world", "flag embedding"] * 100
     embeddings = list(model.embed(docs, batch_size=10, parallel=2))