Fix __repr__ to output all fields #149

Open
wants to merge 10 commits into base: development
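For context, a minimal sketch of the intended effect, using the RecursiveChunker fields shown in the diff below (the top-level import and exact default values are assumptions, not taken from this PR):

```python
# Sketch only: illustrates the repr change, not the library's exact output.
from chonkie import RecursiveChunker  # assumed top-level export

chunker = RecursiveChunker(chunk_size=512)

# Before this PR, repr(chunker) omitted the tokenizer:
#   RecursiveChunker(rules=..., chunk_size=512, min_characters_per_chunk=..., return_type=chunks)
# After this PR, every configured field appears:
#   RecursiveChunker(tokenizer=..., rules=..., chunk_size=512,
#                    min_characters_per_chunk=..., return_type=chunks)
print(repr(chunker))
```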
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -3,7 +3,7 @@ name: 🐛 Bug Report
about: Report a bug in Chonkie
title: "[BUG] "
labels: bug
assignees: bhavnicksm
assignees: bhavnicksm, shreyashnigam

---

2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/feature_request.md
@@ -3,7 +3,7 @@ name: ✨ Feature Request
about: Suggest a new feature for Chonkie
title: "[FEAT] "
labels: enhancement
assignees: bhavnicksm
assignees: bhavnicksm, shreyashnigam

---

2 changes: 2 additions & 0 deletions benchmarks/README.md
@@ -61,6 +61,7 @@ All tests were run on a `c3-highmem-4` VM from Google Cloud with 32 GB RAM and a

| Library | Time | Speed Factor |
|---------|-----------|--------------|

| 🦛 Chonkie | 2 min 17 sec | 1x |
| 🔗 LangChain | 2 min 42 sec | 1.18x slower |
| 📚 LlamaIndex | 50 min | 21.9x slower |
@@ -128,6 +129,7 @@ The following benchmarks were run on the Paul Graham Essays dataset using the GP
| 🔗 LangChain | 625 MiB | ~10x CHONKier |
| 📚 LlamaIndex | 678 MiB | ~11x CHONKier |


## 💡 Why These Numbers Matter

### Speed Benefits
12 changes: 12 additions & 0 deletions src/chonkie/chunker/late.py
@@ -426,3 +426,15 @@
chunk.embedding = embedding

return chunks

def __repr__(self):
"""Return a string representation of the LateChunker."""
return (
f"LateChunker(embedding_model={self.embedding_model}, "
f"mode={self.mode}, "
f"chunk_size={self.chunk_size}, "
f"min_sentences_per_chunk={self.min_sentences_per_chunk}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"approximate={self.approximate}, "
f"delim={self.delim})"
)
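A minimal pytest-style sketch of how the new `__repr__` could be exercised (constructor defaults are assumed; only the field names come from this diff):

```python
# Sketch under assumptions: LateChunker() is constructible with defaults.
from chonkie import LateChunker  # assumed top-level export

def test_late_chunker_repr_lists_all_fields():
    chunker = LateChunker()
    rep = repr(chunker)
    for field in (
        "embedding_model", "mode", "chunk_size",
        "min_sentences_per_chunk", "min_characters_per_sentence",
        "approximate", "delim",
    ):
        assert f"{field}=" in rep  # every configured field should be reported
```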
14 changes: 10 additions & 4 deletions src/chonkie/chunker/recursive.py
@@ -2,7 +2,7 @@
from bisect import bisect_left
from functools import lru_cache
from itertools import accumulate
from typing import Any, Callable, List, Optional, Union, Literal
from typing import Any, Callable, List, Literal, Optional, Union

from chonkie.chunker.base import BaseChunker
from chonkie.types import Chunk, RecursiveChunk, RecursiveLevel, RecursiveRules
@@ -59,7 +59,12 @@
# At every delimiter, replace it with the sep
if rule.delimiters:
for delimiter in rule.delimiters:
text = text.replace(delimiter, delimiter + sep)
if rule.include_delim == "prev":
text = text.replace(delimiter, delimiter + sep)
elif rule.include_delim == "next":
text = text.replace(delimiter, sep + delimiter)
else:
text = text.replace(delimiter, sep)

# Split the text at the sep
splits = [s for s in text.split(sep) if s != ""]
@@ -262,7 +267,8 @@

def __repr__(self) -> str:
"""Get a string representation of the recursive chunker."""
return (f"RecursiveChunker(rules={self.rules}, "
return (f"RecursiveChunker(tokenizer={self.tokenizer}, "
f"rules={self.rules}, "
f"chunk_size={self.chunk_size}, "
f"min_characters_per_chunk={self.min_characters_per_chunk}, "
f"return_type={self.return_type})")
@@ -272,4 +278,4 @@
return (f"RecursiveChunker(rules={self.rules}, "
f"chunk_size={self.chunk_size}, "
f"min_characters_per_chunk={self.min_characters_per_chunk}, "
f"return_type={self.return_type})")
f"return_type={self.return_type})")
3 changes: 2 additions & 1 deletion src/chonkie/chunker/sdpm.py
@@ -1,6 +1,6 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union, Literal
from typing import Any, List, Literal, Union

from chonkie.types import SemanticChunk, Sentence

@@ -177,5 +177,6 @@ def __repr__(self) -> str:
f"min_chunk_size={self.min_chunk_size}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"threshold_step={self.threshold_step}, "
f"delim={self.delim}, "
f"skip_window={self.skip_window})"
)
17 changes: 11 additions & 6 deletions src/chonkie/chunker/semantic.py
@@ -1,14 +1,15 @@
"""Semantic chunking using sentence embeddings."""

import importlib.util as importutil
import warnings
from typing import List, Union, Literal

import numpy as np
from typing import List, Literal, Union

from chonkie.chunker.base import BaseChunker
from chonkie.embeddings.base import BaseEmbeddings
from chonkie.types import SemanticChunk, SemanticSentence, Sentence

if importutil.find_spec("numpy"):
import numpy as np

class SemanticChunker(BaseChunker):
"""Chunker that splits text into semantically coherent chunks using embeddings.
@@ -28,6 +29,7 @@ class SemanticChunker(BaseChunker):

Raises:
ValueError: If parameters are invalid

"""

def __init__(
@@ -250,13 +252,13 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
return sentences

def _get_semantic_similarity(
self, embedding1: np.ndarray, embedding2: np.ndarray
self, embedding1: "np.ndarray", embedding2: "np.ndarray"
) -> float:
"""Compute cosine similarity between two embeddings."""
similarity = self.embedding_model.similarity(embedding1, embedding2)
return similarity

def _compute_group_embedding(self, sentences: List[Sentence]) -> np.ndarray:
def _compute_group_embedding(self, sentences: List[Sentence]) -> "np.ndarray":
"""Compute mean embedding for a group of sentences."""
if len(sentences) == 1:
return sentences[0].embedding
@@ -567,5 +569,8 @@ def __repr__(self) -> str:
f"threshold={self.threshold}, "
f"similarity_window={self.similarity_window}, "
f"min_sentences={self.min_sentences}, "
f"min_chunk_size={self.min_chunk_size})"
f"min_chunk_size={self.min_chunk_size}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"threshold_step={self.threshold_step}, "
f"delim={self.delim})"
)
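The hunk above also makes numpy optional and quotes the np.ndarray annotations; a minimal sketch of that pattern in isolation (the helper name and body are illustrative only, not part of this PR):

```python
# Optional-dependency pattern: the module imports cleanly even without numpy,
# because the annotations are strings and the import is guarded.
import importlib.util as importutil

if importutil.find_spec("numpy"):
    import numpy as np

def mean_embedding(embeddings: "list[np.ndarray]") -> "np.ndarray":
    """Illustrative helper; it fails only if called when numpy is missing."""
    return np.mean(np.stack(embeddings), axis=0)
```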
128 changes: 34 additions & 94 deletions src/chonkie/chunker/sentence.py
@@ -1,7 +1,7 @@
"""Sentence chunker."""
from bisect import bisect_left
from itertools import accumulate
from typing import Any, Callable, List, Union, Literal
from typing import Any, Callable, List, Literal, Union

from chonkie.types import Chunk, Sentence, SentenceChunk

@@ -12,12 +12,15 @@
"""SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.

Args:
tokenizer: The tokenizer instance to use for encoding/decoding
tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding
chunk_size: Maximum number of tokens per chunk
chunk_overlap: Number of tokens to overlap between chunks
min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
min_characters_per_sentence: Minimum number of characters per sentence
approximate: Whether to use approximate token counting (defaults to True)
delim: Delimiters to split sentences on
include_delim: Whether to include delimiters in current chunk, next chunk or not at all (defaults to "prev")
return_type: Whether to return chunks or texts

Raises:
ValueError: If parameters are invalid
@@ -28,27 +31,28 @@
self,
tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
chunk_size: int = 512,
chunk_overlap: int = 128,
chunk_overlap: int = 0,
min_sentences_per_chunk: int = 1,
min_characters_per_sentence: int = 12,
approximate: bool = True,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
include_delim: Union[Literal["prev", "next"], None] = "prev",
return_type: Literal["chunks", "texts"] = "chunks"
):
"""Initialize the SentenceChunker with configuration parameters.

SentenceChunker splits the sentences in a text based on token limits and sentence boundaries.

Args:
tokenizer: The tokenizer instance to use for encoding/decoding
chunk_size: Maximum number of tokens per chunk
chunk_overlap: Number of tokens to overlap between chunks
tokenizer_or_token_counter: The tokenizer instance to use for encoding/decoding (defaults to "gpt2")
chunk_size: Maximum number of tokens per chunk (defaults to 512)
chunk_overlap: Number of tokens to overlap between chunks (defaults to 0)
min_sentences_per_chunk: Minimum number of sentences per chunk (defaults to 1)
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
min_characters_per_sentence: Minimum number of characters per sentence
min_characters_per_sentence: Minimum number of characters per sentence (defaults to 12)
approximate: Whether to use approximate token counting (defaults to True)
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
delim: Delimiters to split sentences on (defaults to [".", "!", "?", "newline"])
include_delim: Whether to include delimiters in current chunk, next chunk or not at all (defaults to "prev")
return_type: Whether to return chunks or texts (defaults to "chunks")

Raises:
ValueError: If parameters are invalid
@@ -64,6 +68,10 @@
raise ValueError("min_sentences_per_chunk must be at least 1")
if min_characters_per_sentence < 1:
raise ValueError("min_characters_per_sentence must be at least 1")
if delim is None:
raise ValueError("delim must be a list of strings or a string")
if include_delim not in ["prev", "next", None]:
raise ValueError("include_delim must be 'prev', 'next' or None")
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

@@ -73,88 +81,10 @@
self.min_characters_per_sentence = min_characters_per_sentence
self.approximate = approximate
self.delim = delim
self.include_delim = include_delim
self.sep = "🦛"
self.return_type = return_type

# TODO: This is a older method of sentence splitting that uses Regex
# but since Regex in python via re is super slooooow we use a different method
# that is faster and more accurate. We can keep this method for reference
# and comparison. And also, we'll need to have a seperate preprocessing
# to handle the special cases that this method handles.

# def _split_sentences(self, text: str) -> List[str]:
# """Split text into sentences using enhanced regex patterns.

# Handles various cases including:
# - Standard sentence endings across multiple writing systems
# - Quotations and parentheses
# - Common abbreviations
# - Decimal numbers
# - Ellipsis
# - Lists and enumerations
# - Special punctuation
# - Common honorifics and titles

# Args:
# text: Input text to be split into sentences

# Returns:
# List of sentences
# """
# # Define sentence ending punctuation marks from various writing systems
# sent_endings = (
# r'[!.?։؟۔܀܁܂߹।॥၊။።፧፨᙮᜵᜶᠃᠉᥄᥅᪨᪩᪪᪫᭚᭛᭞᭟᰻᰼᱾᱿'
# r'‼‽⁇⁈⁉⸮⸼꓿꘎꘏꛳꛷꡶꡷꣎꣏꤯꧈꧉꩝꩞꩟꫰꫱꯫﹒﹖﹗!.?𐩖𐩗'
# r'𑁇𑁈𑂾𑂿𑃀𑃁𑅁𑅂𑅃𑇅𑇆𑇍𑇞𑇟𑈸𑈹𑈻𑈼𑊩𑑋𑑌𑗂𑗃𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒'
# r'𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑜼𑜽𑜾𑩂𑩃𑪛𑪜𑱁𑱂𖩮𖩯𖫵𖬷𖬸𖭄𛲟𝪈。。]'
# )

# # Common abbreviations and titles that don't end sentences
# abbrevs = (
# r"(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|viz|al|Gen|Col|Fig|Lt|Mt|St"
# r"|etc|approx|appt|apt|dept|est|min|max|misc|no|ps|seq|temp|etal"
# r"|e\.g|i\.e|vol|vs|cm|mm|km|kg|lb|ft|pd|hr|sec|min|sq|fx|Feb|Mar"
# r"|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)"
# )

# # First, protect periods in known abbreviations
# text = re.sub(rf"({abbrevs})\.", r"\1@POINT@", text, flags=re.IGNORECASE)

# # Protect decimal numbers
# text = re.sub(r"(\d+)\.(\d+)", r"\1@POINT@\2", text)

# # Protect ellipsis
# text = re.sub(r"\.\.\.", "@ELLIPSIS@", text)

# # Protect email addresses and websites
# text = re.sub(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"@EMAIL@\1@EMAIL@", text)
# text = re.sub(r"(https?://[^\s]+)", r"@URL@\1@URL@", text)

# # Handle parentheses and brackets
# text = re.sub(r'\([^)]*\.[^)]*\)', lambda m: m.group().replace('.', '@POINT@'), text)
# text = re.sub(r'\[[^\]]*\.[^\]]*\]', lambda m: m.group().replace('.', '@POINT@'), text)

# # Handle quotations with sentence endings
# text = re.sub(rf'({sent_endings})"(\s+[A-Z])', r'\1"\n\2', text)

# # Handle standard sentence endings
# text = re.sub(rf'({sent_endings})(\s+[A-Z"]|\s*$)', r'\1\n\2', text)

# # Handle lists and enumerations
# text = re.sub(r'(\d+\.)(\s+[A-Z])', r'\1\n\2', text)
# text = re.sub(r'([a-zA-Z]\.)(\s+[A-Z])', r'\1\n\2', text)

# # Restore protected periods and symbols
# text = text.replace("@POINT@", ".")
# text = text.replace("@ELLIPSIS@", "...")
# text = re.sub(r'@EMAIL@([^@]+)@EMAIL@', r'\1', text)
# text = re.sub(r'@URL@([^@]+)@URL@', r'\1', text)

# # Split into sentences
# sentences = [s.strip() for s in text.split('\n') if s.strip()]

# return sentences

def _split_sentences(self, text: str) -> List[str]:
"""Fast sentence splitting while maintaining accuracy.

Expand All @@ -169,16 +99,21 @@
"""
t = text
for c in self.delim:
t = t.replace(c, c + self.sep)
if self.include_delim == "prev":
t = t.replace(c, c + self.sep)
elif self.include_delim == "next":
t = t.replace(c, self.sep + c)
else:
t = t.replace(c, self.sep)

# Initial split
splits = [s for s in t.split(self.sep) if s != ""]
# print(splits)

# Combine short splits with previous sentence
sentences = []
current = ""

# Combine short splits with previous sentence
for s in splits:
if len(s.strip()) < self.min_characters_per_sentence:
current += s
Expand All @@ -187,6 +122,7 @@
sentences.append(current)
current = s

# Add last sentence
if current:
sentences.append(current)

@@ -393,7 +329,11 @@
def __repr__(self) -> str:
"""Return a string representation of the SentenceChunker."""
return (
f"SentenceChunker(chunk_size={self.chunk_size}, "
f"SentenceChunker(tokenizer={self.tokenizer}, "
f"chunk_size={self.chunk_size}, "
f"chunk_overlap={self.chunk_overlap}, "
f"min_sentences_per_chunk={self.min_sentences_per_chunk})"
f"min_sentences_per_chunk={self.min_sentences_per_chunk}, "
f"min_characters_per_sentence={self.min_characters_per_sentence}, "
f"approximate={self.approximate}, "
f"delim={self.delim})"
)
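A hedged usage sketch of the updated SentenceChunker signature (parameter names and defaults are taken from this diff; the top-level import is assumed):

```python
from chonkie import SentenceChunker  # assumed export path

chunker = SentenceChunker(
    tokenizer_or_token_counter="gpt2",
    chunk_size=512,
    chunk_overlap=0,               # new default in this diff (was 128)
    min_sentences_per_chunk=1,
    min_characters_per_sentence=12,
    delim=[".", "!", "?", "\n"],
    include_delim="prev",          # keep each delimiter on the preceding sentence
    return_type="chunks",
)
print(chunker)  # repr now reports tokenizer, overlap, approximate, delim, ...
```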
2 changes: 1 addition & 1 deletion src/chonkie/chunker/token.py
@@ -1,6 +1,6 @@
"""Token-based chunking."""

from typing import Any, Generator, List, Tuple, Union, Literal
from typing import Any, Generator, List, Literal, Union

from tqdm import trange

5 changes: 3 additions & 2 deletions src/chonkie/chunker/word.py
@@ -1,6 +1,6 @@
"""Word-based chunker."""
import re
from typing import Any, Callable, List, Tuple, Union, Literal
from typing import Any, Callable, List, Literal, Tuple, Union

from chonkie.types import Chunk

@@ -185,6 +185,7 @@ def chunk(self, text: str) -> List[Chunk]:
def __repr__(self) -> str:
"""Return a string representation of the WordChunker."""
return (
f"WordChunker(chunk_size={self.chunk_size}, "
f"WordChunker(tokenizer={self.tokenizer}, "
f"chunk_size={self.chunk_size}, "
f"chunk_overlap={self.chunk_overlap})"
)