Fix __repr__ to output all fields #149

Open
wants to merge 10 commits into base: development
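For context, a minimal sketch of the intended effect, using the RecursiveChunker fields shown in the diff below (the top-level import and exact default values are assumptions, not taken from this PR):

```python
# Sketch only: illustrates the repr change, not the library's exact output.
from chonkie import RecursiveChunker  # assumed top-level export

chunker = RecursiveChunker(chunk_size=512)

# Before this PR, repr(chunker) omitted the tokenizer:
#   RecursiveChunker(rules=..., chunk_size=512, min_characters_per_chunk=..., return_type=chunks)
# After this PR, every configured field appears:
#   RecursiveChunker(tokenizer=..., rules=..., chunk_size=512,
#                    min_characters_per_chunk=..., return_type=chunks)
print(repr(chunker))
```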
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -3,7 +3,7 @@ name: 🐛 Bug Report
about: Report a bug in Chonkie
title: "[BUG] "
labels: bug
assignees: bhavnicksm
assignees: bhavnicksm, shreyashnigam

---

2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/feature_request.md
@@ -3,7 +3,7 @@ name: ✨ Feature Request
about: Suggest a new feature for Chonkie
title: "[FEAT] "
labels: enhancement
assignees: bhavnicksm
assignees: bhavnicksm, shreyashnigam

---

2 changes: 2 additions & 0 deletions benchmarks/README.md
@@ -61,6 +61,7 @@ All tests were run on a `c3-highmem-4` VM from Google Cloud with 32 GB RAM and a

| Library | Time | Speed Factor |
|---------|-----------|--------------|

| 🦛 Chonkie | 2 min 17 sec | 1x |
| 🔗 LangChain | 2 min 42 sec | 1.18x slower |
| 📚 LlamaIndex | 50 min | 21.9x slower |
@@ -128,6 +129,7 @@ The following benchmarks were run on the Paul Graham Essays dataset using the GP
| 🔗 LangChain | 625 MiB | ~10x CHONKier |
| 📚 LlamaIndex | 678 MiB | ~11x CHONKier |


## 💡 Why These Numbers Matter

### Speed Benefits
12 changes: 12 additions & 0 deletions src/chonkie/chunker/late.py
@@ -426,3 +426,15 @@
chunk.embedding = embedding

return chunks

def __repr__(self):
"""Return a string representation of the LateChunker."""
return (
f"LateChunker(embedding_model={self.embedding_model}, "
f"mode={self.mode}, "
f"chunk_size={self.chunk_size}, "
f"min_sentences_per_chunk={self.min_sentences_per_chunk}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"approximate={self.approximate}, "
f"delim={self.delim})"
)
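A minimal pytest-style sketch of how the new `__repr__` could be exercised (constructor defaults are assumed; only the field names come from this diff):

```python
# Sketch under assumptions: LateChunker() is constructible with defaults.
from chonkie import LateChunker  # assumed top-level export

def test_late_chunker_repr_lists_all_fields():
    chunker = LateChunker()
    rep = repr(chunker)
    for field in (
        "embedding_model", "mode", "chunk_size",
        "min_sentences_per_chunk", "min_characters_per_sentence",
        "approximate", "delim",
    ):
        assert f"{field}=" in rep  # every configured field should be reported
```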
14 changes: 10 additions & 4 deletions src/chonkie/chunker/recursive.py
@@ -2,7 +2,7 @@
from bisect import bisect_left
from functools import lru_cache
from itertools import accumulate
from typing import Any, Callable, List, Optional, Union, Literal
from typing import Any, Callable, List, Literal, Optional, Union

from chonkie.chunker.base import BaseChunker
from chonkie.types import Chunk, RecursiveChunk, RecursiveLevel, RecursiveRules
@@ -59,7 +59,12 @@
# At every delimiter, replace it with the sep
if rule.delimiters:
for delimiter in rule.delimiters:
text = text.replace(delimiter, delimiter + sep)
if rule.include_delim == "prev":
text = text.replace(delimiter, delimiter + sep)
elif rule.include_delim == "next":
text = text.replace(delimiter, sep + delimiter)
else:
text = text.replace(delimiter, sep)

# Split the text at the sep
splits = [s for s in text.split(sep) if s != ""]
@@ -262,7 +267,8 @@

def __repr__(self) -> str:
"""Get a string representation of the recursive chunker."""
return (f"RecursiveChunker(rules={self.rules}, "
return (f"RecursiveChunker(tokenizer={self.tokenizer}, "
f"rules={self.rules}, "
f"chunk_size={self.chunk_size}, "
f"min_characters_per_chunk={self.min_characters_per_chunk}, "
f"return_type={self.return_type})")
@@ -272,4 +278,4 @@
return (f"RecursiveChunker(rules={self.rules}, "
f"chunk_size={self.chunk_size}, "
f"min_characters_per_chunk={self.min_characters_per_chunk}, "
f"return_type={self.return_type})")
f"return_type={self.return_type})")
3 changes: 2 additions & 1 deletion src/chonkie/chunker/sdpm.py
@@ -1,6 +1,6 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union, Literal
from typing import Any, List, Literal, Union

from chonkie.types import SemanticChunk, Sentence

@@ -177,5 +177,6 @@ def __repr__(self) -> str:
f"min_chunk_size={self.min_chunk_size}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"threshold_step={self.threshold_step}, "
f"delim={self.delim}, "
f"skip_window={self.skip_window})"
)
17 changes: 11 additions & 6 deletions src/chonkie/chunker/semantic.py
@@ -1,14 +1,15 @@
"""Semantic chunking using sentence embeddings."""

import importlib.util as importutil
import warnings
from typing import List, Union, Literal

import numpy as np
from typing import List, Literal, Union

from chonkie.chunker.base import BaseChunker
from chonkie.embeddings.base import BaseEmbeddings
from chonkie.types import SemanticChunk, SemanticSentence, Sentence

if importutil.find_spec("numpy"):
import numpy as np

class SemanticChunker(BaseChunker):
"""Chunker that splits text into semantically coherent chunks using embeddings.
@@ -28,6 +29,7 @@ class SemanticChunker(BaseChunker):

Raises:
ValueError: If parameters are invalid

"""

def __init__(
@@ -250,13 +252,13 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
return sentences

def _get_semantic_similarity(
self, embedding1: np.ndarray, embedding2: np.ndarray
self, embedding1: "np.ndarray", embedding2: "np.ndarray"
) -> float:
"""Compute cosine similarity between two embeddings."""
similarity = self.embedding_model.similarity(embedding1, embedding2)
return similarity

def _compute_group_embedding(self, sentences: List[Sentence]) -> np.ndarray:
def _compute_group_embedding(self, sentences: List[Sentence]) -> "np.ndarray":
"""Compute mean embedding for a group of sentences."""
if len(sentences) == 1:
return sentences[0].embedding
@@ -567,5 +569,8 @@ def __repr__(self) -> str:
f"threshold={self.threshold}, "
f"similarity_window={self.similarity_window}, "
f"min_sentences={self.min_sentences}, "
f"min_chunk_size={self.min_chunk_size})"
f"min_chunk_size={self.min_chunk_size}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"threshold_step={self.threshold_step}, "
f"delim={self.delim})"
)
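The hunk above also makes numpy optional and quotes the np.ndarray annotations; a minimal sketch of that pattern in isolation (the helper name and body are illustrative only, not part of this PR):

```python
# Optional-dependency pattern: the module imports cleanly even without numpy,
# because the annotations are strings and the import is guarded.
import importlib.util as importutil

if importutil.find_spec("numpy"):
    import numpy as np

def mean_embedding(embeddings: "list[np.ndarray]") -> "np.ndarray":
    """Illustrative helper; it fails only if called when numpy is missing."""
    return np.mean(np.stack(embeddings), axis=0)
```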
128 changes: 34 additions & 94 deletions src/chonkie/chunker/sentence.py
@@ -1,7 +1,7 @@
"""Sentence chunker."""
from bisect import bisect_left
from itertools import accumulate
from typing import Any, Callable, List, Union, Literal
from typing import Any, Callable, List, Literal, Union

from chonkie.types import Chunk, Sentence, SentenceChunk

@@ -12,12 +12,15 @@
"""SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.

Args:
tokenizer: The tokenizer instance to use for encoding/decoding
tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding
chunk_size: Maximum number of tokens per chunk
chunk_overlap: Number of tokens to overlap between chunks
min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
min_characters_per_sentence: Minimum number of characters per sentence
approximate: Whether to use approximate token counting (defaults to True)
delim: Delimiters to split sentences on
include_delim: Whether to include delimiters in current chunk, next chunk or not at all (defaults to "prev")
return_type: Whether to return chunks or texts

Raises:
ValueError: If parameters are invalid
@@ -28,27 +31,28 @@
self,
tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
chunk_size: int = 512,
chunk_overlap: int = 128,
chunk_overlap: int = 0,
min_sentences_per_chunk: int = 1,
min_characters_per_sentence: int = 12,
approximate: bool = True,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
include_delim: Union[Literal["prev", "next"], None] = "prev",
return_type: Literal["chunks", "texts"] = "chunks"
):
"""Initialize the SentenceChunker with configuration parameters.

SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.

Args:
tokenizer: The tokenizer instance to use for encoding/decoding
chunk_size: Maximum number of tokens per chunk
chunk_overlap: Number of tokens to overlap between chunks
tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding (defaults to "gpt2")
chunk_size: Maximum number of tokens per chunk (defaults to 512)
chunk_overlap: Number of tokens to overlap between chunks (defaults to 0)
min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
min_characters_per_sentence: Minimum number of characters per sentence
min_characters_per_sentence: Minimum number of characters per sentence (defaults to 12)
approximate: Whether to use approximate token counting (defaults to True)
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
delim: Delimiters to split sentences on (defaults to [".", "!", "?", "newline"])
include_delim: Whether to include delimiters in current chunk, next chunk or not at all (defaults to "prev")
return_type: Whether to return chunks or texts (defaults to "chunks")

Raises:
ValueError: If parameters are invalid
@@ -64,6 +68,10 @@
raise ValueError("min_sentences_per_chunk must be at least 1")
if min_characters_per_sentence < 1:
raise ValueError("min_characters_per_sentence must be at least 1")
if delim is None:
raise ValueError("delim must be a list of strings or a string")
if include_delim not in ["prev", "next", None]:
raise ValueError("include_delim must be 'prev', 'next' or None")
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

@@ -73,88 +81,10 @@
self.min_characters_per_sentence = min_characters_per_sentence
self.approximate = approximate
self.delim = delim
self.include_delim = include_delim
self.sep = "🦛"
self.return_type = return_type

# TODO: This is a older method of sentence splitting that uses Regex
# but since Regex in python via re is super slooooow we use a different method
# that is faster and more accurate. We can keep this method for reference
# and comparison. And also, we'll need to have a seperate preprocessing
# to handle the special cases that this method handles.

# def _split_sentences(self, text: str) -> List[str]:
# """Split text into sentences using enhanced regex patterns.

# Handles various cases including:
# - Standard sentence endings across multiple writing systems
# - Quotations and parentheses
# - Common abbreviations
# - Decimal numbers
# - Ellipsis
# - Lists and enumerations
# - Special punctuation
# - Common honorifics and titles

# Args:
# text: Input text to be split into sentences

# Returns:
# List of sentences
# """
# # Define sentence ending punctuation marks from various writing systems
# sent_endings = (
# r'[!.?։؟۔܀܁܂߹।॥၊။።፧፨᙮᜵᜶᠃᠉᥄᥅᪨᪩᪪᪫᭚᭛᭞᭟᰻᰼᱾᱿'
# r'‼‽⁇⁈⁉⸮⸼꓿꘎꘏꛳꛷꡶꡷꣎꣏꤯꧈꧉꩝꩞꩟꫰꫱꯫﹒﹖﹗!.?𐩖𐩗'
# r'𑁇𑁈𑂾𑂿𑃀𑃁𑅁𑅂𑅃𑇅𑇆𑇍𑇞𑇟𑈸𑈹𑈻𑈼𑊩𑑋𑑌𑗂𑗃𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒'
# r'𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑜼𑜽𑜾𑩂𑩃𑪛𑪜𑱁𑱂𖩮𖩯𖫵𖬷𖬸𖭄𛲟𝪈。。]'
# )

# # Common abbreviations and titles that don't end sentences
# abbrevs = (
# r"(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|viz|al|Gen|Col|Fig|Lt|Mt|St"
# r"|etc|approx|appt|apt|dept|est|min|max|misc|no|ps|seq|temp|etal"
# r"|e\.g|i\.e|vol|vs|cm|mm|km|kg|lb|ft|pd|hr|sec|min|sq|fx|Feb|Mar"
# r"|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)"
# )

# # First, protect periods in known abbreviations
# text = re.sub(rf"({abbrevs})\.", r"\1@POINT@", text, flags=re.IGNORECASE)

# # Protect decimal numbers
# text = re.sub(r"(\d+)\.(\d+)", r"\1@POINT@\2", text)

# # Protect ellipsis
# text = re.sub(r"\.\.\.", "@ELLIPSIS@", text)

# # Protect email addresses and websites
# text = re.sub(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"@EMAIL@\1@EMAIL@", text)
# text = re.sub(r"(https?://[^\s]+)", r"@URL@\1@URL@", text)

# # Handle parentheses and brackets
# text = re.sub(r'\([^)]*\.[^)]*\)', lambda m: m.group().replace('.', '@POINT@'), text)
# text = re.sub(r'\[[^\]]*\.[^\]]*\]', lambda m: m.group().replace('.', '@POINT@'), text)

# # Handle quotations with sentence endings
# text = re.sub(rf'({sent_endings})"(\s+[A-Z])', r'\1"\n\2', text)

# # Handle standard sentence endings
# text = re.sub(rf'({sent_endings})(\s+[A-Z"]|\s*$)', r'\1\n\2', text)

# # Handle lists and enumerations
# text = re.sub(r'(\d+\.)(\s+[A-Z])', r'\1\n\2', text)
# text = re.sub(r'([a-zA-Z]\.)(\s+[A-Z])', r'\1\n\2', text)

# # Restore protected periods and symbols
# text = text.replace("@POINT@", ".")
# text = text.replace("@ELLIPSIS@", "...")
# text = re.sub(r'@EMAIL@([^@]+)@EMAIL@', r'\1', text)
# text = re.sub(r'@URL@([^@]+)@URL@', r'\1', text)

# # Split into sentences
# sentences = [s.strip() for s in text.split('\n') if s.strip()]

# return sentences

def _split_sentences(self, text: str) -> List[str]:
"""Fast sentence splitting while maintaining accuracy.

Expand All @@ -169,16 +99,21 @@
"""
t = text
for c in self.delim:
t = t.replace(c, c + self.sep)
if self.include_delim == "prev":
t = t.replace(c, c + self.sep)
elif self.include_delim == "next":
t = t.replace(c, self.sep + c)
else:
t = t.replace(c, self.sep)

# Initial split
splits = [s for s in t.split(self.sep) if s != ""]
# print(splits)

# Combine short splits with previous sentence
sentences = []
current = ""

# Combine short splits with previous sentence
for s in splits:
if len(s.strip()) < self.min_characters_per_sentence:
current += s
Expand All @@ -187,6 +122,7 @@
sentences.append(current)
current = s

# Add last sentence
if current:
sentences.append(current)

@@ -393,7 +329,11 @@
def __repr__(self) -> str:
"""Return a string representation of the SentenceChunker."""
return (
f"SentenceChunker(chunk_size={self.chunk_size}, "
f"SentenceChunker(tokenizer={self.tokenizer}, "
f"chunk_size={self.chunk_size}, "
f"chunk_overlap={self.chunk_overlap}, "
f"min_sentences_per_chunk={self.min_sentences_per_chunk})"
f"min_sentences_per_chunk={self.min_sentences_per_chunk}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"approximate={self.approximate}, "
f"delim={self.delim})"
)
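A hedged usage sketch of the updated SentenceChunker signature (parameter names and defaults are taken from this diff; the top-level import is assumed):

```python
from chonkie import SentenceChunker  # assumed export path

chunker = SentenceChunker(
    tokenizer_or_token_counter="gpt2",
    chunk_size=512,
    chunk_overlap=0,               # new default in this diff (was 128)
    min_sentences_per_chunk=1,
    min_characters_per_sentence=12,
    delim=[".", "!", "?", "\n"],
    include_delim="prev",          # keep each delimiter on the preceding sentence
    return_type="chunks",
)
print(chunker)  # repr now reports tokenizer, overlap, approximate, delim, ...
```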
2 changes: 1 addition & 1 deletion src/chonkie/chunker/token.py
@@ -1,6 +1,6 @@
"""Token-based chunking."""

from typing import Any, Generator, List, Tuple, Union, Literal
from typing import Any, Generator, List, Literal, Union

from tqdm import trange

5 changes: 3 additions & 2 deletions src/chonkie/chunker/word.py
@@ -1,6 +1,6 @@
"""Word-based chunker."""
import re
from typing import Any, Callable, List, Tuple, Union, Literal
from typing import Any, Callable, List, Literal, Tuple, Union

from chonkie.types import Chunk

@@ -185,6 +185,7 @@ def chunk(self, text: str) -> List[Chunk]:
def __repr__(self) -> str:
"""Return a string representation of the WordChunker."""
return (
f"WordChunker(chunk_size={self.chunk_size}, "
f"WordChunker(tokenizer={self.tokenizer}, "
f"chunk_size={self.chunk_size}, "
f"chunk_overlap={self.chunk_overlap})"
)