
Commit 25671ec

joein and NirantK authored
Update ruff (#172)
* refactoring: reduce max line-length
* new: update ruff

Co-authored-by: Nirant <[email protected]>
1 parent ce98631 commit 25671ec

13 files changed: +193 additions, −163 deletions

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.13
+    rev: v0.3.4
     hooks:
       - id: ruff
         types_or: [ python, pyi, jupyter ]
```

fastembed/common/model_management.py

Lines changed: 13 additions & 4 deletions
```diff
@@ -92,7 +92,9 @@ def download_file_from_gcs(cls, url: str, output_path: str, show_progress: bool
 
         show_progress = total_size_in_bytes and show_progress
 
-        with tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True, disable=not show_progress) as progress_bar:
+        with tqdm(
+            total=total_size_in_bytes, unit="iB", unit_scale=True, disable=not show_progress
+        ) as progress_bar:
             with open(output_path, "wb") as file:
                 for chunk in response.iter_content(chunk_size=1024):
                     if chunk:  # Filter out keep-alive new chunks
@@ -101,7 +103,9 @@ def download_file_from_gcs(cls, url: str, output_path: str, show_progress: bool
         return output_path
 
     @classmethod
-    def download_files_from_huggingface(cls, hf_source_repo: str, cache_dir: Optional[str] = None) -> str:
+    def download_files_from_huggingface(
+        cls, hf_source_repo: str, cache_dir: Optional[str] = None
+    ) -> str:
         """
         Downloads a model from HuggingFace Hub.
         Args:
@@ -216,9 +220,14 @@ def download_model(cls, model: Dict[str, Any], cache_dir: Path) -> Path:
 
         if hf_source:
             try:
-                return Path(cls.download_files_from_huggingface(hf_source, cache_dir=str(cache_dir)))
+                return Path(
+                    cls.download_files_from_huggingface(hf_source, cache_dir=str(cache_dir))
+                )
             except (EnvironmentError, RepositoryNotFoundError, ValueError) as e:
-                logger.error(f"Could not download model from HuggingFace: {e}" "Falling back to other sources.")
+                logger.error(
+                    f"Could not download model from HuggingFace: {e}"
+                    "Falling back to other sources."
+                )
 
         if url_source:
             return cls.retrieve_model_gcs(model["model"], url_source, str(cache_dir))
```

fastembed/common/models.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -33,7 +33,9 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
 
     tokenizer = Tokenizer.from_file(str(tokenizer_path))
     tokenizer.enable_truncation(max_length=min(tokenizer_config["model_max_length"], max_length))
-    tokenizer.enable_padding(pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"])
+    tokenizer.enable_padding(
+        pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"]
+    )
 
     for token in tokens_map.values():
         if isinstance(token, str):
```
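
For readers less familiar with the `tokenizers` API that `load_tokenizer` configures, here is a minimal standalone sketch of the truncation/padding setup; the file path, pad token, and sample inputs are placeholders, not values from fastembed:

```python
from tokenizers import Tokenizer

# Placeholders for illustration; load_tokenizer reads these from the model directory.
tokenizer = Tokenizer.from_file("tokenizer.json")
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")

encoded = tokenizer.encode_batch(["hello world", "a longer second document"])
input_ids = [e.ids for e in encoded]                  # padded to equal length
attention_mask = [e.attention_mask for e in encoded]  # 1 for tokens, 0 for padding
```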

fastembed/common/onnx_model.py

Lines changed: 9 additions & 3 deletions
```diff
@@ -48,7 +48,9 @@ def load_onnx_model(self, model_dir: Path, threads: Optional[int], max_length: i
         so.inter_op_num_threads = threads
 
         self.tokenizer = load_tokenizer(model_dir=model_dir, max_length=max_length)
-        self.model = ort.InferenceSession(str(model_path), providers=onnx_providers, sess_options=so)
+        self.model = ort.InferenceSession(
+            str(model_path), providers=onnx_providers, sess_options=so
+        )
 
     def onnx_embed(self, documents: List[str]) -> Tuple[np.ndarray, np.ndarray]:
         encoded = self.tokenizer.encode_batch(documents)
@@ -58,7 +60,9 @@ def onnx_embed(self, documents: List[str]) -> Tuple[np.ndarray, np.ndarray]:
         onnx_input = {
             "input_ids": np.array(input_ids, dtype=np.int64),
             "attention_mask": np.array(attention_mask, dtype=np.int64),
-            "token_type_ids": np.array([np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64),
+            "token_type_ids": np.array(
+                [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64
+            ),
         }
 
         onnx_input = self._preprocess_onnx_input(onnx_input)
@@ -97,7 +101,9 @@ def _embed_documents(
             "model_name": model_name,
             "cache_dir": cache_dir,
         }
-        pool = ParallelWorkerPool(parallel, self._get_worker_class(), start_method=start_method)
+        pool = ParallelWorkerPool(
+            parallel, self._get_worker_class(), start_method=start_method
+        )
         for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
             yield from self._post_process_onnx_output(batch)
```
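
The `onnx_embed` hunk above shows how fastembed feeds a BERT-style ONNX model. A self-contained sketch of the same input construction, where the model path and token ids are placeholders:

```python
import numpy as np
import onnxruntime as ort

# Placeholder token ids for two already-tokenized documents.
input_ids = [[101, 7592, 102], [101, 2088, 102]]
attention_mask = [[1, 1, 1], [1, 1, 1]]

onnx_input = {
    "input_ids": np.array(input_ids, dtype=np.int64),
    "attention_mask": np.array(attention_mask, dtype=np.int64),
    # BERT-style models also expect token_type_ids; all zeros for single-segment input.
    "token_type_ids": np.array(
        [np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64
    ),
}

session = ort.InferenceSession("model.onnx")  # path is a placeholder
outputs = session.run(None, onnx_input)       # first output holds the token embeddings
```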

fastembed/embedding.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -5,7 +5,8 @@
 from fastembed.text.text_embedding import TextEmbedding
 
 logger.warning(
-    "DefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated." "Use from fastembed import TextEmbedding instead."
+    "DefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated."
+    "Use from fastembed import TextEmbedding instead."
 )
 
 DefaultEmbedding = TextEmbedding
```
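
This file is a deprecation shim: the old class names stay importable as aliases of `TextEmbedding`, with the warning above logged on import. A sketch of the migration the warning asks for; the model name is illustrative:

```python
# Deprecated: still works, but logs the warning shown in the diff.
from fastembed.embedding import DefaultEmbedding

# Preferred going forward:
from fastembed import TextEmbedding

model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")  # illustrative model name
vectors = list(model.embed(["hello world"]))
```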

fastembed/parallel_processor.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -128,7 +128,9 @@ def ordered_map(self, stream: Iterable[Any], *args: Any, **kwargs: Any) -> Itera
                 yield buffer.pop(next_expected)
                 next_expected += 1
 
-    def semi_ordered_map(self, stream: Iterable[Any], *args: Any, **kwargs: Any) -> Iterable[Tuple[int, Any]]:
+    def semi_ordered_map(
+        self, stream: Iterable[Any], *args: Any, **kwargs: Any
+    ) -> Iterable[Tuple[int, Any]]:
         try:
             self.start(**kwargs)
```

fastembed/sparse/sparse_embedding_base.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -22,7 +22,13 @@ def as_dict(self) -> Dict[int, float]:
 
 
 class SparseTextEmbeddingBase(ModelManagement):
-    def __init__(self, model_name: str, cache_dir: Optional[str] = None, threads: Optional[int] = None, **kwargs):
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        **kwargs,
+    ):
         self.model_name = model_name
         self.cache_dir = cache_dir
         self.threads = threads
```

fastembed/sparse/splade_pp.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -30,7 +30,9 @@
 
 class SpladePP(SparseTextEmbeddingBase, OnnxModel[SparseEmbedding]):
     @classmethod
-    def _post_process_onnx_output(cls, output: Tuple[np.ndarray, np.ndarray]) -> Iterable[SparseEmbedding]:
+    def _post_process_onnx_output(
+        cls, output: Tuple[np.ndarray, np.ndarray]
+    ) -> Iterable[SparseEmbedding]:
         logits, attention_mask = output
         relu_log = np.log(1 + np.maximum(logits, 0))
```
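
The hunk ends at the `log(1 + relu(logits))` activation. In SPLADE-style models this is typically followed by masked max-pooling over token positions to get one weight per vocabulary term; the pooling step below is that common pattern, an assumption rather than code from this commit:

```python
import numpy as np

# Illustrative shapes: (batch, seq_len, vocab_size) logits and a matching mask.
logits = np.random.randn(2, 8, 30522).astype(np.float32)
attention_mask = np.ones((2, 8), dtype=np.int64)

relu_log = np.log(1 + np.maximum(logits, 0))      # the step shown in the diff
weighted = relu_log * attention_mask[..., None]   # zero out padding positions (assumed)
scores = weighted.max(axis=1)                     # max-pool per vocab term (assumed)

# A sparse embedding keeps only the non-zero (index, value) pairs per document.
for row in scores:
    indices = np.nonzero(row)[0]
    values = row[indices]
```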

fastembed/text/jina_onnx_embedding.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -49,7 +49,9 @@ def list_supported_models(cls) -> List[Dict[str, Any]]:
         return supported_jina_models
 
     @classmethod
-    def _post_process_onnx_output(cls, output: Tuple[np.ndarray, np.ndarray]) -> Iterable[np.ndarray]:
+    def _post_process_onnx_output(
+        cls, output: Tuple[np.ndarray, np.ndarray]
+    ) -> Iterable[np.ndarray]:
         embeddings, attn_mask = output
         return normalize(cls.mean_pooling(embeddings, attn_mask)).astype(np.float32)
```
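
For context, a minimal numpy sketch of the mean pooling plus L2 normalization used in this post-processing step; the helpers are illustrative stand-ins, not fastembed's exact implementations:

```python
import numpy as np

def mean_pooling(embeddings: np.ndarray, attn_mask: np.ndarray) -> np.ndarray:
    # Average token vectors, counting only positions the attention mask marks as real.
    mask = attn_mask[..., None].astype(np.float32)   # (batch, seq, 1)
    summed = (embeddings * mask).sum(axis=1)
    counts = np.clip(mask.sum(axis=1), 1e-9, None)   # avoid division by zero
    return summed / counts

def normalize(v: np.ndarray) -> np.ndarray:
    # L2-normalize each row so embeddings are unit length.
    return v / np.linalg.norm(v, axis=1, keepdims=True)
```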

fastembed/text/onnx_embedding.py

Lines changed: 9 additions & 7 deletions
```diff
@@ -144,12 +144,12 @@
     # ]
     # }
     {
-            "model": "mixedbread-ai/mxbai-embed-large-v1",
-            "dim": 1024,
-            "description": "MixedBread Base sentence embedding model, does well on MTEB",
-            "size_in_GB": 1.34,
-            "sources": {
-                "hf": "mixedbread-ai/mxbai-embed-large-v1",
+        "model": "mixedbread-ai/mxbai-embed-large-v1",
+        "dim": 1024,
+        "description": "MixedBread Base sentence embedding model, does well on MTEB",
+        "size_in_GB": 1.34,
+        "sources": {
+            "hf": "mixedbread-ai/mxbai-embed-large-v1",
         },
     },
 ]
@@ -239,7 +239,9 @@ def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str,
         return onnx_input
 
     @classmethod
-    def _post_process_onnx_output(cls, output: Tuple[np.ndarray, np.ndarray]) -> Iterable[np.ndarray]:
+    def _post_process_onnx_output(
+        cls, output: Tuple[np.ndarray, np.ndarray]
+    ) -> Iterable[np.ndarray]:
         embeddings, _ = output
         return normalize(embeddings[:, 0]).astype(np.float32)
```
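
Unlike the Jina model's mean pooling, this path takes the first (CLS) token vector per document before normalizing; a short sketch with illustrative shapes:

```python
import numpy as np

# (batch, seq_len, dim) token embeddings from the ONNX model; shapes are illustrative.
embeddings = np.random.randn(4, 16, 384).astype(np.float32)

cls_vectors = embeddings[:, 0]  # first token per document, as in the diff above
normed = cls_vectors / np.linalg.norm(cls_vectors, axis=1, keepdims=True)
```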
