Commit ce0e544

chore: Generalized query marker and document marker
1 parent 01b15e0 commit ce0e544

File tree

fastembed/common/preprocessor_utils.py
fastembed/late_interaction/colbert.py

2 files changed: +41 −11

fastembed/common/preprocessor_utils.py

Lines changed: 12 additions & 0 deletions
@@ -68,6 +68,18 @@ def load_tokenizer(model_dir: Path) -> Tuple[Tokenizer, dict]:
             token_str = token.get("content", "")
             special_token_to_id[token_str] = tokenizer.token_to_id(token_str)
 
+    if tokenizer_config["tokenizer_class"] == "BertTokenizer":
+        query_marker = {"[Q]": 1}
+        document_marker = {"[D]": 2}
+    elif tokenizer_config["tokenizer_class"] == "XLMRobertaTokenizer":
+        query_marker = {"[QueryMarker]": 250002}
+        document_marker = {"[DocumentMarker]": 250003}
+    else:
+        query_marker = {}
+        document_marker = {}
+
+    special_token_to_id.update(query_marker)
+    special_token_to_id.update(document_marker)
     return tokenizer, special_token_to_id
 
 
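For context, a minimal standalone sketch (not part of the commit) of what this new branch contributes to special_token_to_id for the two supported tokenizer classes; the tokenizer_config dicts below are hypothetical stand-ins, not values read from a real checkpoint.

# Hedged illustration of the marker-selection logic added above; inputs are made up.
def resolve_markers(tokenizer_config: dict) -> dict:
    if tokenizer_config["tokenizer_class"] == "BertTokenizer":
        query_marker, document_marker = {"[Q]": 1}, {"[D]": 2}
    elif tokenizer_config["tokenizer_class"] == "XLMRobertaTokenizer":
        query_marker, document_marker = {"[QueryMarker]": 250002}, {"[DocumentMarker]": 250003}
    else:
        query_marker, document_marker = {}, {}
    # These pairs are merged into special_token_to_id in load_tokenizer.
    return {**query_marker, **document_marker}

print(resolve_markers({"tokenizer_class": "BertTokenizer"}))
# {'[Q]': 1, '[D]': 2}
print(resolve_markers({"tokenizer_class": "XLMRobertaTokenizer"}))
# {'[QueryMarker]': 250002, '[DocumentMarker]': 250003}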

fastembed/late_interaction/colbert.py

Lines changed: 29 additions & 11 deletions
@@ -48,10 +48,10 @@
 
 
 class Colbert(LateInteractionTextEmbeddingBase, OnnxTextModel[np.ndarray]):
-    QUERY_MARKER_TOKEN_ID = 1
-    DOCUMENT_MARKER_TOKEN_ID = 2
     MIN_QUERY_LENGTH = 32
     MASK_TOKENS = ["[MASK]", "<mask>"]
+    QUERY_MARKER_TOKENS = ["[Q]", "[QueryMarker]"]
+    DOCUMENT_MARKER_TOKENS = ["[D]", "[DocumentMarker]"]
 
     def _post_process_onnx_output(
         self, output: OnnxOutputContext, is_doc: bool = True
@@ -74,9 +74,9 @@ def _preprocess_onnx_input(
         self, onnx_input: Dict[str, np.ndarray], is_doc: bool = True
     ) -> Dict[str, np.ndarray]:
         if is_doc:
-            onnx_input["input_ids"][:, 1] = self.DOCUMENT_MARKER_TOKEN_ID
+            onnx_input["input_ids"][:, 1] = self.document_marker_token_id
         else:
-            onnx_input["input_ids"][:, 1] = self.QUERY_MARKER_TOKEN_ID
+            onnx_input["input_ids"][:, 1] = self.query_marker_token_id
         return onnx_input
 
     def tokenize(self, documents: List[str], is_doc: bool = True) -> List[Encoding]:
@@ -87,16 +87,17 @@ def tokenize(self, documents: List[str], is_doc: bool = True) -> List[Encoding]:
         )
 
     def _tokenize_query(self, query: str) -> List[Encoding]:
-        # ". " is added to a query to be replaced with a special query token
-        query = [f". {query}"]
+        # "@ " is added to a query to be replaced with a special query token
+        # please make sure that "@ " is considered as one token in all tokenizers we use in Late Interaction Models
+        query = [f"@ {query}"]
         encoded = self.tokenizer.encode_batch(query)
         # colbert authors recommend to pad queries with [MASK] tokens for query augmentation to improve performance
         if len(encoded[0].ids) < self.MIN_QUERY_LENGTH:
             prev_padding = None
             if self.tokenizer.padding:
                 prev_padding = self.tokenizer.padding
             self.tokenizer.enable_padding(
-                pad_token=self.MASK_TOKENS[0],
+                pad_token=self.mask_token,
                 pad_id=self.mask_token_id,
                 length=self.MIN_QUERY_LENGTH,
             )
@@ -108,8 +109,9 @@ def _tokenize_query(self, query: str) -> List[Encoding]:
         return encoded
 
     def _tokenize_documents(self, documents: List[str]) -> List[Encoding]:
-        # ". " is added to a document to be replaced with a special document token
-        documents = [". " + doc for doc in documents]
+        # "@ " is added to a document to be replaced with a special document token
+        # please make sure that "@ " is considered as one token in all tokenizers we use in Late Interaction Models
+        documents = ["@ " + doc for doc in documents]
         encoded = self.tokenizer.encode_batch(documents)
         return encoded
 
@@ -157,12 +159,28 @@ def __init__(
             threads=threads,
             providers=providers,
         )
-        self.mask_token_id = next(
+        self.mask_token_id, self.mask_token = next(
             (
-                self.special_token_to_id[token]
+                (self.special_token_to_id[token], token)
                 for token in self.MASK_TOKENS
                 if token in self.special_token_to_id
            ),
+            (None, None),
+        )
+        self.query_marker_token_id = next(
+            (
+                self.special_token_to_id[token]
+                for token in self.QUERY_MARKER_TOKENS
+                if token in self.special_token_to_id
+            ),
+            None,
+        )
+        self.document_marker_token_id = next(
+            (
+                self.special_token_to_id[token]
+                for token in self.DOCUMENT_MARKER_TOKENS
+                if token in self.special_token_to_id
+            ),
             None,
         )
         self.pad_token_id = self.tokenizer.padding["pad_id"]
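As a rough sketch of the lookup pattern these __init__ additions rely on, assuming a hypothetical special_token_to_id map for a BERT-style checkpoint (the map below is illustrative, not taken from a real model):

from typing import Dict, List, Optional

# Hypothetical token-to-id map; a real one is built by load_tokenizer at model load time.
special_token_to_id: Dict[str, int] = {"[MASK]": 103, "[Q]": 1, "[D]": 2}

QUERY_MARKER_TOKENS: List[str] = ["[Q]", "[QueryMarker]"]
DOCUMENT_MARKER_TOKENS: List[str] = ["[D]", "[DocumentMarker]"]

# next() yields the id of the first marker alias present in the map, or None if no alias matches.
query_marker_token_id: Optional[int] = next(
    (special_token_to_id[t] for t in QUERY_MARKER_TOKENS if t in special_token_to_id), None
)
document_marker_token_id: Optional[int] = next(
    (special_token_to_id[t] for t in DOCUMENT_MARKER_TOKENS if t in special_token_to_id), None
)
print(query_marker_token_id, document_marker_token_id)  # 1 2 for this example map

The resolved id is then written over position 1 of input_ids in _preprocess_onnx_input, i.e. over the token produced by the "@ " placeholder that _tokenize_query and _tokenize_documents prepend.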
