Commit 877dd74

refactor: Removed type ignore

1 parent b5c6b62
File tree

5 files changed: +25 -25 lines changed

fastembed/common/onnx_model.py
fastembed/late_interaction/colbert.py
fastembed/rerank/cross_encoder/onnx_text_model.py
fastembed/sparse/bm42.py
fastembed/text/onnx_text_model.py

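The commit removes "# type: ignore" comments across fastembed's ONNX text models. A plausible reason, assumed here rather than stated in the commit, is that the tokenizer and model attributes now carry precise type hints, so a type checker can resolve calls like encode_batch() and run() without per-call suppressions. A minimal sketch of that idea (class name and annotations are hypothetical):

import onnxruntime as ort
from tokenizers import Tokenizer

class OnnxTextModelSketch:
    # Hypothetical annotations for illustration: once these attributes
    # are typed precisely (instead of Any/Optional), mypy can check the
    # calls below without "# type: ignore".
    tokenizer: Tokenizer
    model: ort.InferenceSession

    def tokenize(self, documents: list[str]):
        return self.tokenizer.encode_batch(documents)  # type-checks cleanly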

fastembed/common/onnx_model.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def _load_onnx_model(
             str(model_path), providers=onnx_providers, sess_options=so
         )
         if "CUDAExecutionProvider" in requested_provider_names:
-            current_providers = self.model.get_providers()  # type: ignore
+            current_providers = self.model.get_providers()
             if "CUDAExecutionProvider" not in current_providers:
                 warnings.warn(
                     f"Attempt to set CUDAExecutionProvider failed. Current providers: {current_providers}."

fastembed/late_interaction/colbert.py

Lines changed: 12 additions & 12 deletions
@@ -88,26 +88,26 @@ def tokenize(self, documents: list[str], is_doc: bool = True, **kwargs: Any) ->
         )
 
     def _tokenize_query(self, query: str) -> list[Encoding]:
-        encoded = self.tokenizer.encode_batch([query])  # type: ignore
+        encoded = self.tokenizer.encode_batch([query])
         # colbert authors recommend to pad queries with [MASK] tokens for query augmentation to improve performance
         if len(encoded[0].ids) < self.MIN_QUERY_LENGTH:
             prev_padding = None
-            if self.tokenizer.padding:  # type: ignore
-                prev_padding = self.tokenizer.padding  # type: ignore
-            self.tokenizer.enable_padding(  # type: ignore
+            if self.tokenizer.padding:
+                prev_padding = self.tokenizer.padding
+            self.tokenizer.enable_padding(
                 pad_token=self.MASK_TOKEN,
                 pad_id=self.mask_token_id,
                 length=self.MIN_QUERY_LENGTH,
             )
-            encoded = self.tokenizer.encode_batch([query])  # type: ignore
+            encoded = self.tokenizer.encode_batch([query])
             if prev_padding is None:
-                self.tokenizer.no_padding()  # type: ignore
+                self.tokenizer.no_padding()
             else:
-                self.tokenizer.enable_padding(**prev_padding)  # type: ignore
+                self.tokenizer.enable_padding(**prev_padding)
         return encoded
 
     def _tokenize_documents(self, documents: list[str]) -> list[Encoding]:
-        encoded = self.tokenizer.encode_batch(documents)  # type: ignore
+        encoded = self.tokenizer.encode_batch(documents)
         return encoded
 
     @classmethod
@@ -195,14 +195,14 @@ def load_onnx_model(self) -> None:
             device_id=self.device_id,
         )
         self.mask_token_id = self.special_token_to_id[self.MASK_TOKEN]
-        self.pad_token_id = self.tokenizer.padding["pad_id"]  # type: ignore
+        self.pad_token_id = self.tokenizer.padding["pad_id"]
         self.skip_list = {
-            self.tokenizer.encode(symbol, add_special_tokens=False).ids[0]  # type: ignore
+            self.tokenizer.encode(symbol, add_special_tokens=False).ids[0]
             for symbol in string.punctuation
         }
-        current_max_length = self.tokenizer.truncation["max_length"]  # type: ignore
+        current_max_length = self.tokenizer.truncation["max_length"]
         # ensure not to overflow after adding document-marker
-        self.tokenizer.enable_truncation(max_length=current_max_length - 1)  # type: ignore
+        self.tokenizer.enable_truncation(max_length=current_max_length - 1)
 
     def embed(
         self,
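
The save/restore dance around enable_padding uses the Hugging Face tokenizers API: the padding property returns the current configuration as a dict (or None when padding is off), and that dict can be fed straight back into enable_padding(**prev_padding). A standalone sketch of the [MASK]-padding query augmentation, assuming a BERT-style tokenizer from the hub; MIN_QUERY_LENGTH = 32 is illustrative, not taken from this commit:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

MIN_QUERY_LENGTH = 32
mask_token_id = tokenizer.token_to_id("[MASK]")

encoded = tokenizer.encode_batch(["what is late interaction?"])
if len(encoded[0].ids) < MIN_QUERY_LENGTH:
    prev_padding = tokenizer.padding  # current settings as a dict, or None
    # pad short queries with [MASK] instead of [PAD] (query augmentation)
    tokenizer.enable_padding(
        pad_token="[MASK]",
        pad_id=mask_token_id,
        length=MIN_QUERY_LENGTH,
    )
    encoded = tokenizer.encode_batch(["what is late interaction?"])
    # restore whatever padding configuration was active before
    if prev_padding is None:
        tokenizer.no_padding()
    else:
        tokenizer.enable_padding(**prev_padding)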

fastembed/rerank/cross_encoder/onnx_text_model.py

Lines changed: 4 additions & 4 deletions
@@ -45,10 +45,10 @@ def _load_onnx_model(
         self.tokenizer, _ = load_tokenizer(model_dir=model_dir)
 
     def tokenize(self, pairs: list[tuple[str, str]], **_: Any) -> list[Encoding]:
-        return self.tokenizer.encode_batch(pairs)  # type: ignore
+        return self.tokenizer.encode_batch(pairs)
 
     def _build_onnx_input(self, tokenized_input: list[Encoding]) -> dict[str, NumpyArray]:
-        input_names: set[str] = {node.name for node in self.model.get_inputs()}  # type: ignore
+        input_names: set[str] = {node.name for node in self.model.get_inputs()}
         inputs: dict[str, NumpyArray] = {
             "input_ids": np.array([enc.ids for enc in tokenized_input], dtype=np.int64),
         }
@@ -70,7 +70,7 @@ def onnx_embed_pairs(self, pairs: list[tuple[str, str]], **kwargs: Any) -> OnnxO
         tokenized_input = self.tokenize(pairs, **kwargs)
         inputs = self._build_onnx_input(tokenized_input)
         onnx_input = self._preprocess_onnx_input(inputs, **kwargs)
-        outputs = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)  # type: ignore
+        outputs = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)
         relevant_output = outputs[0]
         scores: NumpyArray = relevant_output[:, 0]
         return OnnxOutputContext(model_output=scores)
@@ -98,7 +98,7 @@ def _rerank_pairs(
         is_small = False
 
         if isinstance(pairs, tuple):
-            pairs = [pairs]  # type: ignore
+            pairs = [pairs]
             is_small = True
 
         if isinstance(pairs, list):
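
The pair-scoring path above can be exercised standalone: tokenize (query, document) pairs, feed only the inputs the ONNX graph declares, run the session, and read the first logit column as the relevance score. A sketch with placeholder file paths:

import numpy as np
import onnxruntime as ort
from tokenizers import Tokenizer

session = ort.InferenceSession("reranker.onnx")    # placeholder path
tokenizer = Tokenizer.from_file("tokenizer.json")  # placeholder path

pairs = [("what is a cross encoder?", "A cross encoder scores text pairs.")]
encodings = tokenizer.encode_batch(pairs)

# only pass inputs the graph actually declares
input_names = {node.name for node in session.get_inputs()}
onnx_input = {"input_ids": np.array([e.ids for e in encodings], dtype=np.int64)}
if "attention_mask" in input_names:
    onnx_input["attention_mask"] = np.array(
        [e.attention_mask for e in encodings], dtype=np.int64
    )
if "token_type_ids" in input_names:
    onnx_input["token_type_ids"] = np.array(
        [e.type_ids for e in encodings], dtype=np.int64
    )

outputs = session.run(None, onnx_input)  # None requests all outputs
scores = outputs[0][:, 0]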

fastembed/sparse/bm42.py

Lines changed: 3 additions & 3 deletions
@@ -139,7 +139,7 @@ def load_onnx_model(self) -> None:
             cuda=self.cuda,
             device_id=self.device_id,
         )
-        for token, idx in self.tokenizer.get_vocab().items():  # type: ignore
+        for token, idx in self.tokenizer.get_vocab().items():
             self.invert_vocab[idx] = token
         self.special_tokens = set(self.special_token_to_id.keys())
         self.special_tokens_ids = set(self.special_token_to_id.values())
@@ -177,7 +177,7 @@ def _reconstruct_bpe(
         acc: str = ""
         acc_idx: list[int] = []
 
-        continuing_subword_prefix = self.tokenizer.model.continuing_subword_prefix  # type: ignore
+        continuing_subword_prefix = self.tokenizer.model.continuing_subword_prefix
         continuing_subword_prefix_len = len(continuing_subword_prefix)
 
         for idx, token in bpe_tokens:
@@ -324,7 +324,7 @@ def query_embed(
         self.load_onnx_model()
 
         for text in query:
-            encoded = self.tokenizer.encode(text)  # type: ignore
+            encoded = self.tokenizer.encode(text)
             document_tokens_with_ids = enumerate(encoded.tokens)
             reconstructed = self._reconstruct_bpe(document_tokens_with_ids)
             filtered = self._filter_pair_tokens(reconstructed)
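
_reconstruct_bpe leans on the tokenizer model's continuing_subword_prefix (typically "##" for WordPiece) to merge subword pieces back into whole words before BM42 scoring. A simplified standalone sketch of the idea; the real method also carries token indices along:

def reconstruct_words(tokens: list[str], prefix: str = "##") -> list[str]:
    # merge tokens that continue the previous word, e.g.
    # ["emb", "##ed", "##ding", "model"] -> ["embedding", "model"]
    words: list[str] = []
    for token in tokens:
        if token.startswith(prefix) and words:
            words[-1] += token[len(prefix):]
        else:
            words.append(token)
    return words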

fastembed/text/onnx_text_model.py

Lines changed: 5 additions & 5 deletions
@@ -60,17 +60,17 @@ def load_onnx_model(self) -> None:
         raise NotImplementedError("Subclasses must implement this method")
 
     def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
-        return self.tokenizer.encode_batch(documents)  # type: ignore
+        return self.tokenizer.encode_batch(documents)
 
     def onnx_embed(
         self,
         documents: list[str],
         **kwargs: Any,
     ) -> OnnxOutputContext:
         encoded = self.tokenize(documents, **kwargs)
-        input_ids = np.array([e.ids for e in encoded])  # type: ignore
-        attention_mask = np.array([e.attention_mask for e in encoded])  # type: ignore
-        input_names = {node.name for node in self.model.get_inputs()}  # type: ignore
+        input_ids = np.array([e.ids for e in encoded])
+        attention_mask = np.array([e.attention_mask for e in encoded])
+        input_names = {node.name for node in self.model.get_inputs()}
         onnx_input: dict[str, NumpyArray] = {
             "input_ids": np.array(input_ids, dtype=np.int64),
         }
@@ -82,7 +82,7 @@ def onnx_embed(
             )
         onnx_input = self._preprocess_onnx_input(onnx_input, **kwargs)
 
-        model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)  # type: ignore
+        model_output = self.model.run(self.ONNX_OUTPUT_NAMES, onnx_input)
         return OnnxOutputContext(
             model_output=model_output[0],
             attention_mask=onnx_input.get("attention_mask", attention_mask),
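
The OnnxOutputContext returned here carries the attention mask next to the raw token states, which downstream pooling needs. A sketch of the common mean-pooling step over padded positions (an assumption about downstream use, not part of this commit), with illustrative shapes:

import numpy as np

model_output = np.random.rand(2, 4, 3).astype(np.float32)  # (batch, seq, hidden)
attention_mask = np.array([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=np.int64)

mask = attention_mask[:, :, None]           # broadcast over the hidden dim
summed = (model_output * mask).sum(axis=1)  # zero out padded positions
counts = mask.sum(axis=1)                   # real-token count per row
embeddings = summed / counts                # (batch, hidden) mean-pooled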
