StarlightSearch
diff --git a/‎Cargo.lock‎
Lines changed: 383 additions & 13 deletions b/‎Cargo.lock‎
Lines changed: 383 additions & 13 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 1 deletion b/‎Cargo.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/blog/posts/ReleaseNotes5-5.md‎
Lines changed: 3 additions & 7 deletions b/‎docs/blog/posts/ReleaseNotes5-5.md‎
Lines changed: 3 additions & 7 deletions
diff --git a/‎examples/text.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/text.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎processors/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎processors/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎processors/src/pdf/pdf_processor.rs‎
Lines changed: 69 additions & 14 deletions b/‎processors/src/pdf/pdf_processor.rs‎
Lines changed: 69 additions & 14 deletions
diff --git a/‎python/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎python/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/python/embed_anything/_embed_anything.pyi‎
Lines changed: 5 additions & 0 deletions b/‎python/python/embed_anything/_embed_anything.pyi‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎python/src/config.rs‎
Lines changed: 16 additions & 2 deletions b/‎python/src/config.rs‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎python/src/lib.rs‎
Lines changed: 23 additions & 2 deletions b/‎python/src/lib.rs‎
Lines changed: 23 additions & 2 deletions
@@ -15,7 +15,7 @@ description = "Embed anything at lightning speed"
 repository = "https://github.com/StarlightSearch/EmbedAnything"
 authors = ["Akshay Ballal <arballal95@gmail.com>"]
 exclude = ["test_files/*", "python", "*.py", "pyproject.toml", "examples/images/*", "mkdocs.yml", "docs/*", "tests/*", ".github", "Dockerfile", "docs"]
-version = "0.5.6"
+version = "0.6.0"
 
 [workspace.dependencies]
 pdf-extract = "0.7.7"
 
@@ -11,7 +11,7 @@ Super Excited to share the latest development in our library, which essentially
 
 <!-- more -->
 
-## Support for late chunking.
+## Late Chunking
 
 The new 0.5.6 version adds Late Chunking to EmbedAnything, a technique introduced by Jina AI and Weaviate. 
 Here's how we've implemented Late Chunking in EA:
@@ -46,7 +46,7 @@ data: list[EmbedData] = model.embed_file("test_files/attention.pdf", config=conf
 ```
 
 
-## 𝘊𝘰𝘩𝘦𝘳𝘦 𝘌𝘮𝘣𝘦𝘥 4:
+## Cohere Embed 4:
 
 🧊 Single embedding per document, even for multimodal inputs
 📚 Handles up to 128K tokens – perfect for long-form business documents
@@ -77,11 +77,7 @@ model = embed_anything.EmbeddingModel.from_pretrained_hf(
 
 ## Processor Crate:
 
-This crate contains various "processors" that accept files/folders/bytes and produced a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation!
-
-
-
-
+This crate contains various "processors" that accept files and produce a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation! It also currently supports PDF text extraction via two backends: MuPDF and LoPDF. You can choose between them by passing `pdf_backend` as either `mupdf` or `lopdf` in `TextEmbedConfig`.
 
 We have also received some additional cool feature requests on GitHub, which we would like to implement. If you want to help out, please check out EmbedAnything on GitHub. We would love to have your contributions. 🚀
 
 
@@ -13,7 +13,7 @@
 def embed_directory_example():
     # Configure the embedding process
     config = TextEmbedConfig(
-        chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence"
+        chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence", pdf_backend="lopdf"
     )
 
     # Start timing
@@ -73,7 +73,7 @@ def embed_file_example():
 # Example 4: Embed files in a batch
 def embed_files_batch_example():
 
-    config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64)
+    config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64, pdf_backend="mupdf")
 
     data = model.embed_files_batch(["test_files/bank.txt", "test_files/test.pdf"])
 
 
@@ -23,6 +23,7 @@ htmd = "0.1.6"
 
 # PDF processing
 pdf-extract = {workspace = true}
+mupdf = "0.5.0"
 docx-parser = "0.1.1"
 pdf2image = "0.1.3"
 image = "0.25.6"
 
@@ -1,29 +1,42 @@
-use std::path::Path;
+use crate::markdown_processor::MarkdownProcessor;
+use crate::pdf::tesseract::input::{Args, Image};
+use crate::processor::{Document, DocumentProcessor, FileProcessor};
 use anyhow::Error;
 use image::DynamicImage;
 use pdf2image::{Pages, RenderOptionsBuilder, PDF};
+use std::path::Path;
 use text_splitter::ChunkConfigError;
-use crate::markdown_processor::MarkdownProcessor;
-use crate::pdf::tesseract::input::{Args, Image};
-use crate::processor::{Document, DocumentProcessor, FileProcessor};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PdfBackend { // Selects the library that extracts text from a PDF; consulted only on the non-OCR path.
+    LoPdf, // Text comes from a single `pdf_extract::extract_text` call on the file path.
+    MuPdf, // Text is gathered page by page through the `mupdf` crate and then joined.
+}
 
 /// Processes PDF files: extracts their text and hands it to a `MarkdownProcessor`.
 pub struct PdfProcessor {
     markdown_processor: MarkdownProcessor, // receives the extracted text via `process_document`
     ocr_config: OcrConfig, // supplies the Tesseract path passed to `extract_text_with_ocr`
+    backend: PdfBackend, // matched on to choose MuPDF or LoPDF extraction when OCR is not taken
 }
 
 pub struct OcrConfig { // OCR settings carried by `PdfProcessor`.
     pub use_ocr: bool, // presumably toggles the OCR branch in `process_file` — condition not visible here, confirm
-    pub tesseract_path: Option<String>
+    pub tesseract_path: Option<String>, // forwarded (as `Option<&str>`) to `extract_text_with_ocr`
 }
 
 impl PdfProcessor {
-    pub fn new(chunk_size: usize, overlap: usize, ocr_config: OcrConfig) -> Result<PdfProcessor, ChunkConfigError> {
+    pub fn new( // Builds a processor; fails only if the inner `MarkdownProcessor` cannot be created.
+        chunk_size: usize, // passed straight to `MarkdownProcessor::new`
+        overlap: usize, // passed straight to `MarkdownProcessor::new`
+        ocr_config: OcrConfig, // stored as-is
+        backend: PdfBackend, // stored as-is; picks the text-extraction library on the non-OCR path
+    ) -> Result<PdfProcessor, ChunkConfigError> {
         let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?; // `?` returns any ChunkConfigError to the caller
         Ok(PdfProcessor {
             markdown_processor,
             ocr_config,
+            backend,
         })
     }
 }
@@ -34,16 +47,49 @@ impl FileProcessor for PdfProcessor {
             let tesseract_path = self.ocr_config.tesseract_path.as_deref();
             extract_text_with_ocr(&path, tesseract_path)?
         } else {
-            pdf_extract::extract_text(path).map_err(|e| anyhow::anyhow!(e))?
+            match self.backend {
+                PdfBackend::MuPdf => {
+                    let mut page_texts = Vec::new();
+                    {
+                        let document = mupdf::document::Document::open(path.as_ref())?;
+                        let pages = document.pages()?;
+
+                        for (page_number, page_result) in pages.enumerate() {
+                            let page = page_result?;
+                            let text_page =
+                                page.to_text_page(mupdf::text_page::TextPageOptions::empty())?;
+
+                            let mut page_text = String::new();
+                            for block in text_page.blocks() {
+                                for line in block.lines() {
+                                    let chars: String =
+                                        line.chars().map(|c| c.char().unwrap_or(' ')).collect();
+                                    page_text.push_str(&chars);
+                                    page_text.push('\n');
+                                }
+                                page_text.push('\n');
+                            }
+
+                            page_texts.push((page_number, page_text));
+                        }
+                    }
+                    page_texts
+                        .into_iter()
+                        .map(|(_, text)| text)
+                        .collect::<Vec<String>>()
+                        .join("\n")
+                }
+                PdfBackend::LoPdf => {
+                    pdf_extract::extract_text(path.as_ref()).map_err(|e| anyhow::anyhow!(e))?
+                }
+            }
         };
-        
+
         self.markdown_processor.process_document(&content)
     }
 }
 
-fn get_images_from_pdf<T: AsRef<Path>>(
-    file_path: &T,
-) -> Result<Vec<DynamicImage>, Error> {
+fn get_images_from_pdf<T: AsRef<Path>>(file_path: &T) -> Result<Vec<DynamicImage>, Error> {
     let pdf = PDF::from_file(file_path)?;
     let page_count = pdf.page_count();
     let pages = pdf.render(
@@ -68,15 +114,15 @@ fn extract_text_with_ocr<T: AsRef<Path>>(
         .iter()
         .map(|image| extract_text_from_image(image, &Args::default().with_path(tesseract_path)))
         .collect();
-    
+
     // Join the texts and clean up empty lines
     let text = texts?.join("\n");
     let cleaned_text = text
         .lines()
         .filter(|line| !line.trim().is_empty())
         .collect::<Vec<&str>>()
         .join("\n");
-    
+
     Ok(cleaned_text)
 }
 
@@ -90,7 +136,16 @@ mod tests {
     fn test_extract_text() {
         let temp_dir = TempDir::new("example").unwrap();
         let pdf_file = temp_dir.path().join("test.pdf");
-        let processor = PdfProcessor::new(128, 0, OcrConfig { use_ocr: false, tesseract_path: None }).unwrap();
+        let processor = PdfProcessor::new(
+            128,
+            0,
+            OcrConfig {
+                use_ocr: false,
+                tesseract_path: None,
+            },
+            PdfBackend::MuPdf,
+        )
+        .unwrap();
 
         File::create(pdf_file).unwrap();
 
 
@@ -9,6 +9,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 embed_anything = {path = "../rust", features = ["ort"]}
+processors = {path = "../processors"}
 pyo3 = { version = "0.23.2"}
 tokio = { version = "1.39.0", features = ["rt-multi-thread"]}
 strum =  {workspace = true}
 
@@ -502,6 +502,7 @@ class TextEmbedConfig:
         semantic_encoder: The semantic encoder for the Text Embedding model. Default is None.
         use_ocr: A flag indicating whether to use OCR for the Text Embedding model. Default is False.
         tesseract_path: The path to the Tesseract OCR executable. Default is None and uses the system path.
+        pdf_backend: The backend to use for PDF text extraction. Options are `mupdf` and `lopdf`. Default is `lopdf`.
     """
 
     def __init__(
@@ -515,6 +516,7 @@ class TextEmbedConfig:
         semantic_encoder: EmbeddingModel | None = None,
         use_ocr: bool | None = False,
         tesseract_path: str | None = None,
+        pdf_backend: str | None = "lopdf",
     ):
         self.chunk_size = chunk_size
         self.overlap_ratio = overlap_ratio
@@ -525,6 +527,7 @@ class TextEmbedConfig:
         self.semantic_encoder = semantic_encoder
         self.use_ocr = use_ocr
         self.tesseract_path = tesseract_path
+        self.pdf_backend = pdf_backend
     chunk_size: int | None
     overlap_ratio: float | None
     batch_size: int | None
@@ -534,6 +537,7 @@ class TextEmbedConfig:
     semantic_encoder: EmbeddingModel | None
     use_ocr: bool | None
     tesseract_path: str | None
+    pdf_backend: str | None
 
 class ImageEmbedConfig:
     """
@@ -853,6 +857,7 @@ class WhichModel(Enum):
     Cohere = ("Cohere",)
     CohereVision = ("CohereVision",)
     Bert = ("Bert",)
+    Model2Vec = ("Model2Vec",)
     Jina = ("Jina",)
     Clip = ("Clip",)
     Colpali = ("Colpali",)
 
@@ -1,6 +1,7 @@
 use crate::EmbeddingModel;
 use embed_anything::config::SplittingStrategy;
 use pyo3::prelude::*;
+use processors::pdf::pdf_processor::PdfBackend;
 
 #[pyclass]
 #[derive(Default)]
@@ -12,7 +13,7 @@ pub struct TextEmbedConfig {
 #[pymethods]
 impl TextEmbedConfig {
     #[new]
-    #[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None))]
+    #[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None, pdf_backend=None))]
     pub fn new(
         chunk_size: Option<usize>,
         batch_size: Option<usize>,
@@ -23,7 +24,19 @@ impl TextEmbedConfig {
         semantic_encoder: Option<&EmbeddingModel>,
         use_ocr: Option<bool>,
         tesseract_path: Option<&str>,
+        pdf_backend: Option<&str>,
     ) -> Self {
+        let pdf_backend = match pdf_backend {
+            Some(backend) => {
+                match backend {
+                    "mupdf" => PdfBackend::MuPdf,
+                    "lopdf" => PdfBackend::LoPdf,
+                    _ => panic!("Unknown PDF backend provided!"),
+                }
+            }
+            None => PdfBackend::LoPdf,
+        };
+
         let strategy = match splitting_strategy {
             Some(strategy) => {
                 match strategy {
@@ -49,7 +62,8 @@ impl TextEmbedConfig {
                 .with_buffer_size(buffer_size.unwrap_or(100))
                 .with_splitting_strategy(strategy)
                 .with_late_chunking(late_chunking.unwrap_or(false))
-                .with_ocr(use_ocr.unwrap_or(false), tesseract_path),
+                .with_ocr(use_ocr.unwrap_or(false), tesseract_path)
+                .with_pdf_backend(pdf_backend),
         }
     }
 
 
@@ -1,7 +1,14 @@
 pub mod config;
 pub mod models;
 use embed_anything::embeddings::embed::{TextEmbedder, VisionEmbedder};
-use embed_anything::{self, config::TextEmbedConfig, emb_audio, embeddings::embed::{Embedder, EmbeddingResult}, file_processor::audio::audio_processor, FileLoadingError};
+use embed_anything::{
+    self,
+    config::TextEmbedConfig,
+    emb_audio,
+    embeddings::embed::{Embedder, EmbeddingResult},
+    file_processor::audio::audio_processor,
+    FileLoadingError,
+};
 use models::colbert::ColbertModel;
 use models::colpali::ColpaliModel;
 use models::reranker::{DocumentRank, Dtype, Reranker, RerankerResult};
@@ -84,6 +91,7 @@ pub enum WhichModel {
     Cohere,
     CohereVision,
     Bert,
+    Model2Vec,
     SparseBert,
     ColBert,
     Clip,
@@ -221,12 +229,25 @@ impl EmbeddingModel {
                     embed_anything::embeddings::local::jina::JinaEmbedder::new(
                         model_id, revision, token,
                     )
-                    .unwrap(),
+                    .map_err(|e| PyValueError::new_err(e.to_string()))?,
                 )));
                 Ok(EmbeddingModel {
                     inner: Arc::new(model),
                 })
             }
+            WhichModel::Model2Vec => {
+                let model_id = model_id.unwrap_or("minishlab/potion-base-8M");
+                let model = Embedder::Text(TextEmbedder::Model2Vec(Box::new(
+                    embed_anything::embeddings::local::model2vec::Model2VecEmbedder::new(
+                        model_id, token, None,
+                    )
+                    .map_err(|e| PyValueError::new_err(e.to_string()))?,
+                )));
+
+                Ok(EmbeddingModel {
+                    inner: Arc::new(model),
+                })
+            }
             WhichModel::Colpali => {
                 let model_id = model_id.unwrap_or("vidore/colpali-v1.2-merged");
                 let model = Embedder::Vision(VisionEmbedder::ColPali(Box::new(