qdrant
diff --git a/‎fastembed/__init__.py
+8-3 b/‎fastembed/__init__.py
+8-3
diff --git a/‎fastembed/common/onnx_model.py
+3-1 b/‎fastembed/common/onnx_model.py
+3-1
diff --git a/‎fastembed/image/onnx_embedding.py
+3-1 b/‎fastembed/image/onnx_embedding.py
+3-1
diff --git a/‎fastembed/image/onnx_image_model.py
+4-4 b/‎fastembed/image/onnx_image_model.py
+4-4
diff --git a/‎fastembed/late_interaction/__init__.py
+4 b/‎fastembed/late_interaction/__init__.py
+4
diff --git a/‎fastembed/late_interaction/colbert.py
+196 b/‎fastembed/late_interaction/colbert.py
+196
diff --git a/‎fastembed/late_interaction/late_interaction_embedding_base.py
+60 b/‎fastembed/late_interaction/late_interaction_embedding_base.py
+60
@@ -3,13 +3,18 @@
 from fastembed.image import ImageEmbedding
 from fastembed.text import TextEmbedding
 from fastembed.sparse import SparseTextEmbedding, SparseEmbedding
-
+from fastembed.late_interaction import LateInteractionTextEmbedding
 
 try:
     version = importlib.metadata.version("fastembed")
 except importlib.metadata.PackageNotFoundError as _:
     version = importlib.metadata.version("fastembed-gpu")
 
 __version__ = version
-__all__ = ["TextEmbedding", "SparseTextEmbedding", "SparseEmbedding", "ImageEmbedding"]
-
+__all__ = [
+    "TextEmbedding",
+    "SparseTextEmbedding",
+    "SparseEmbedding",
+    "ImageEmbedding",
+    "LateInteractionTextEmbedding",
+]
@@ -33,7 +33,9 @@ def __init__(self) -> None:
         self.model = None
         self.tokenizer = None
 
-    def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+    def _preprocess_onnx_input(
+        self, onnx_input: Dict[str, np.ndarray], **kwargs
+    ) -> Dict[str, np.ndarray]:
         """
         Preprocess the onnx input.
         """
 
@@ -112,7 +112,9 @@ def embed(
     def _get_worker_class(cls) -> Type["ImageEmbeddingWorker"]:
         return OnnxImageEmbeddingWorker
 
-    def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+    def _preprocess_onnx_input(
+        self, onnx_input: Dict[str, np.ndarray], **kwargs
+    ) -> Dict[str, np.ndarray]:
         """
         Preprocess the onnx input.
         """
 
@@ -28,7 +28,9 @@ def __init__(self) -> None:
         super().__init__()
         self.processor = None
 
-    def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+    def _preprocess_onnx_input(
+        self, onnx_input: Dict[str, np.ndarray], **kwargs
+    ) -> Dict[str, np.ndarray]:
         """
         Preprocess the onnx input.
         """
@@ -49,16 +51,14 @@ def load_onnx_model(
     def _build_onnx_input(self, encoded: np.ndarray) -> Dict[str, np.ndarray]:
         return {node.name: encoded for node in self.model.get_inputs()}
 
-    def onnx_embed(self, images: List[PathInput]) -> OnnxOutputContext:
+    def onnx_embed(self, images: List[PathInput], **kwargs) -> OnnxOutputContext:
         with contextlib.ExitStack():
             image_files = [Image.open(image) for image in images]
             encoded = self.processor(image_files)
         onnx_input = self._build_onnx_input(encoded)
         onnx_input = self._preprocess_onnx_input(onnx_input)
         model_output = self.model.run(None, onnx_input)
-
         embeddings = model_output[0].reshape(len(images), -1)
-
         return OnnxOutputContext(
             model_output=embeddings
         )
 
@@ -0,0 +1,4 @@
+from fastembed.late_interaction.late_interaction_text_embedding import LateInteractionTextEmbedding
+
+
+__all__ = ["LateInteractionTextEmbedding"]
@@ -0,0 +1,196 @@
+from typing import Any, Dict, Iterable, List, Optional, Union, Type, Sequence
+import string
+
+import numpy as np
+from tokenizers import Encoding
+
+from fastembed.common import OnnxProvider
+from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.utils import define_cache_dir
+from fastembed.late_interaction.late_interaction_embedding_base import (
+    LateInteractionTextEmbeddingBase,
+)
+from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker
+
+
+supported_colbert_models = [
+    {
+        "model": "colbert-ir/colbertv2.0",
+        "dim": 128,
+        "description": "Late interaction model",
+        "size_in_GB": 0.44,
+        "sources": {
+            "hf": "colbert-ir/colbertv2.0",
+        },
+        "model_file": "model.onnx",
+    }
+]
+
+
+class Colbert(LateInteractionTextEmbeddingBase, OnnxTextModel[np.ndarray]):
+    QUERY_MARKER_TOKEN_ID = 1
+    DOCUMENT_MARKER_TOKEN_ID = 2
+    MIN_QUERY_LENGTH = 32
+    MASK_TOKEN = "[MASK]"
+
+    def _post_process_onnx_output(
+        self, output: OnnxOutputContext, is_doc: bool = True
+    ) -> Iterable[np.ndarray]:
+        if not is_doc:
+            return output.model_output.astype(np.float32)
+
+        for i, token_sequence in enumerate(output.input_ids):
+            for j, token_id in enumerate(token_sequence):
+                if token_id in self.skip_list or token_id == self.pad_token_id:
+                    output.attention_mask[i, j] = 0
+
+        output.model_output *= np.expand_dims(output.attention_mask, 2).astype(np.float32)
+        norm = np.linalg.norm(output.model_output, ord=2, axis=2, keepdims=True)
+        norm_clamped = np.maximum(norm, 1e-12)
+        output.model_output /= norm_clamped
+        return output.model_output.astype(np.float32)
+
+    def _preprocess_onnx_input(
+        self, onnx_input: Dict[str, np.ndarray], is_doc: bool = True
+    ) -> Dict[str, np.ndarray]:
+        if is_doc:
+            onnx_input["input_ids"][:, 1] = self.DOCUMENT_MARKER_TOKEN_ID
+        else:
+            onnx_input["input_ids"][:, 1] = self.QUERY_MARKER_TOKEN_ID
+        return onnx_input
+
+    def tokenize(self, documents: List[str], is_doc: bool = True) -> List[Encoding]:
+        return (
+            self._tokenize_documents(documents=documents)
+            if is_doc
+            else self._tokenize_query(query=next(iter(documents)))
+        )
+
+    def _tokenize_query(self, query: str) -> List[Encoding]:
+        # ". " is added to a query to be replaced with a special query token
+        query = [f". {query}"]
+        encoded = self.tokenizer.encode_batch(query)
+        # colbert authors recommend to pad queries with [MASK] tokens for query augmentation to improve performance
+        if len(encoded[0].ids) < self.MIN_QUERY_LENGTH:
+            prev_padding = None
+            if self.tokenizer.padding:
+                prev_padding = self.tokenizer.padding
+            self.tokenizer.enable_padding(
+                pad_token=self.MASK_TOKEN, pad_id=self.mask_token_id, length=self.MIN_QUERY_LENGTH
+            )
+            encoded = self.tokenizer.encode_batch(query)
+            if prev_padding is None:
+                self.tokenizer.no_padding()
+            else:
+                self.tokenizer.enable_padding(**prev_padding)
+        return encoded
+
+    def _tokenize_documents(self, documents: List[str]) -> List[Encoding]:
+        # ". " is added to a document to be replaced with a special document token
+        documents = [". " + doc for doc in documents]
+        encoded = self.tokenizer.encode_batch(documents)
+        return encoded
+
+    @classmethod
+    def list_supported_models(cls) -> List[Dict[str, Any]]:
+        """Lists the supported models.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing the model information.
+        """
+        return supported_colbert_models
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        providers: Optional[Sequence[OnnxProvider]] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the cache directory.
+                                       Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                                       Defaults to `fastembed_cache` in the system's temp directory.
+            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+
+        model_description = self._get_model_description(model_name)
+        cache_dir = define_cache_dir(cache_dir)
+
+        model_dir = self.download_model(
+            model_description, cache_dir, local_files_only=self._local_files_only
+        )
+
+        self.load_onnx_model(
+            model_dir=model_dir,
+            model_file=model_description["model_file"],
+            threads=threads,
+            providers=providers,
+        )
+        self.mask_token_id = self.special_token_to_id["[MASK]"]
+        self.pad_token_id = self.tokenizer.padding["pad_id"]
+
+        self.skip_list = {
+            self.tokenizer.encode(symbol, add_special_tokens=False).ids[0]
+            for symbol in string.punctuation
+        }
+
+    def embed(
+        self,
+        documents: Union[str, Iterable[str]],
+        batch_size: int = 256,
+        parallel: Optional[int] = None,
+        **kwargs,
+    ) -> Iterable[np.ndarray]:
+        """
+        Encode a list of documents into list of embeddings.
+        We use mean pooling with attention so that the model can handle variable-length inputs.
+
+        Args:
+            documents: Iterator of documents or single document to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+        Returns:
+            List of embeddings, one per document
+        """
+        yield from self._embed_documents(
+            model_name=self.model_name,
+            cache_dir=str(self.cache_dir),
+            documents=documents,
+            batch_size=batch_size,
+            parallel=parallel,
+        )
+
+    def query_embed(self, query: Union[str, List[str]], **kwargs) -> np.ndarray:
+        if isinstance(query, str):
+            query = [query]
+
+        for text in query:
+            yield from self._post_process_onnx_output(
+                self.onnx_embed([text], is_doc=False), is_doc=False
+            )
+
+    @classmethod
+    def _get_worker_class(cls) -> Type[TextEmbeddingWorker]:
+        return ColbertEmbeddingWorker
+
+
+class ColbertEmbeddingWorker(TextEmbeddingWorker):
+    def init_embedding(
+        self,
+        model_name: str,
+        cache_dir: str,
+    ) -> Colbert:
+        return Colbert(model_name=model_name, cache_dir=cache_dir, threads=1)
@@ -0,0 +1,60 @@
+from typing import Iterable, Optional, Union
+
+import numpy as np
+
+from fastembed.common.model_management import ModelManagement
+
+
+class LateInteractionTextEmbeddingBase(ModelManagement):
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.cache_dir = cache_dir
+        self.threads = threads
+        self._local_files_only = kwargs.pop("local_files_only", False)
+
+    def embed(
+        self,
+        documents: Union[str, Iterable[str]],
+        batch_size: int = 256,
+        parallel: Optional[int] = None,
+        **kwargs,
+    ) -> Iterable[np.ndarray]:
+        raise NotImplementedError()
+
+    def passage_embed(self, texts: Iterable[str], **kwargs) -> Iterable[np.ndarray]:
+        """
+        Embeds a list of text passages into a list of embeddings.
+
+        Args:
+            texts (Iterable[str]): The list of texts to embed.
+            **kwargs: Additional keyword argument to pass to the embed method.
+
+        Yields:
+            Iterable[np.ndarray]: The embeddings.
+        """
+
+        # This is model-specific, so that different models can have specialized implementations
+        yield from self.embed(texts, **kwargs)
+
+    def query_embed(self, query: Union[str, Iterable[str]], **kwargs) -> Iterable[np.ndarray]:
+        """
+        Embeds queries
+
+        Args:
+            query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries.
+
+        Returns:
+            Iterable[np.ndarray]: The embeddings.
+        """
+
+        # This is model-specific, so that different models can have specialized implementations
+        if isinstance(query, str):
+            yield from self.embed([query], **kwargs)
+        if isinstance(query, Iterable):
+            yield from self.embed(query, **kwargs)