Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Semantic Chunking Processor #96

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,380 changes: 2,123 additions & 1,257 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
description = "Scraper is designed to automate the process of gathering information from a variety of key Bitcoin-related sources. It leverages GitHub Actions to schedule nightly cron jobs, ensuring that the most up-to-date content is captured from each source according to a defined frequency. The scraped data are then stored in an Elasticsearch index."
authors = ["kouloumos <[email protected]>", "urvish patel <[email protected]>"]
readme = "README.md"
packages = [{include = "scraper"}]
packages = [{include = "./scraper"}]

[tool.poetry.dependencies]
python = ">=3.11,<3.13"
Expand All @@ -18,6 +18,9 @@ python-dotenv = "^1.0.1"
aiohttp = "^3.10.5"
scrapy = "^2.11.2"
openai = "^1.52.2"
langchain = "^0.3.11"
langchain-experimental = "^0.3.3"
langchain-openai = "0.2.2"


[tool.poetry.group.dev.dependencies]
Expand Down
3 changes: 3 additions & 0 deletions scraper/models/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ class ScrapedDocument(BaseModel):
authors: Optional[List[str]] = Field(
default=None, description="List of authors of the document"
)
chunks: Optional[List[dict]] = Field(
default=None, description="List of chunks generated after splitting the document"
)
test_document: Optional[bool] = Field(
# TODO: remove hardcoded value
default=True,
Expand Down
2 changes: 2 additions & 0 deletions scraper/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from .summarization_processor import SummarizationProcessor
from .topic_extractor_processor import TopicExtractorProcessor
from .vector_embeddings_processor import VectorEmbeddingsProcessor
from .semantic_chunking_processor import SemanticChunkingProcessor

__all__ = [
"ProcessorManager",
"BaseProcessor",
"SummarizationProcessor",
"TopicExtractorProcessor",
"VectorEmbeddingsProcessor",
"SemanticChunkingProcessor"
]
176 changes: 176 additions & 0 deletions scraper/processors/semantic_chunking_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import os
import re
import json
import traceback
import tiktoken
import numpy as np
import inspect
from dotenv import load_dotenv

# https://api.python.langchain.com/en/latest/text_splitter/langchain_experimental.text_splitter.SemanticChunker.html
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

from scraper.models import ScrapedDocument
from scraper.processors.base_processor import BaseProcessor
from scraper.registry import processor_registry


@processor_registry.register("semantic_chunking")
class SemanticChunkingProcessor(BaseProcessor):
def __init__(self):
super().__init__()
self.params = self.load_processor_params()

def load_processor_params(self):
"""Load parameters for chunker, llm model names and other params"""
# Default params
params = {
# Semantic chunking params (for SemanticChunker, from documentation)
"breakpoint_threshold_type": "standard_deviation",
"buffer_size": 4,
"breakpoint_threshold_amount": 3,
"sentence_split_regex": r'(?<=[\.\!\?\n])(\n?)\s+',
# Openai models
"oai_embedding_model": "text-embedding-3-small", # Model for embedding
"oai_gen_model": "gpt-4o-mini", # Model used for generating titles
# Other params
"chunk_threshold": 6000, # Chunk only if number of tokens larger than this
"use_markdown_headers": True, # Use markdown headers for splitting
"max_chunk_size": 6000, # Max number of tokens in a chunk
"generate_titles": False, # Use LLM to generate titles for a chunk
}
param_file = os.getenv("SEMANTIC_CHUNKING_PARAMS", None) # A Json file
if param_file:
params_ = json.load(open(param_file))
for k,v in params_:
params[k] = v
if "sentence_split_regex" in params:
params["sentence_split_regex"] = re.compile(params["sentence_split_regex"])
return params

def check_regex_parsable(self, full_body):
found = re.findall(r'\n# .+\n\n', full_body) # H1 headers
found_h2 = re.findall(r'\n## .+\n\n', full_body) # H2 headers
if len(found) > 2 or len(found_h2) > 2:
return True
return False

def chunk_text_by_markdown(self, text, found):
"""Chunk text based on markdown headers found in the text"""
encoding = tiktoken.encoding_for_model(
self.params.get("oai_embedding_model", "text-embedding-3-small"))
delimiters = []
found = [i.strip("\n") for i in found]
delimiters.extend(found)
text_chunks = re.split(r"(" + "|".join([re.escape(i) for i in found]) + ")",
text)
text_chunks = [i for i in text_chunks if len(i)>0]
regex_text_chunks = []
for chunk in text_chunks:
num_tokens = len(encoding.encode(chunk))
if num_tokens > self.params.get("max_chunk_size", 6000):
sub_chunk_delim = re.findall(r'\n## .+\n\n', chunk)
sub_chunk_delim = [i.strip("\n") for i in sub_chunk_delim]
delimiters.extend(sub_chunk_delim)
if len(sub_chunk_delim) > 0:
regex_text_chunks.extend(
re.split(r"(" + "|".join([re.escape(i) for i in sub_chunk_delim]) + ")",
chunk))
else:
regex_text_chunks.append(chunk)
else:
regex_text_chunks.append(chunk)
# Combine delimiter string with next chunk
text_chunks = []
running_str = regex_text_chunks[0]
for chunk in regex_text_chunks[1:]:
if chunk in delimiters:
text_chunks.append(running_str)
running_str = chunk
else:
running_str += chunk
text_chunks.append(running_str)
text_chunks = [(i, "regex") for i in text_chunks]
return text_chunks

def chunk_text_semantically(self, text):
"""Chunk text based on markdown headers"""
found = re.findall(r'\n# .+\n\n', text) # H1 headers
found_h2 = re.findall(r'\n## .+\n\n', text) # H2 headers
found_h3 = re.findall(r'\n### .+\n\n', text) # H3 headers

chunker_possible_params = dict(inspect.signature(SemanticChunker).parameters)
chunker_params = {k:v for k,v in self.params.items() if k in chunker_possible_params}
text_splitter = SemanticChunker(
OpenAIEmbeddings(model=self.params.get("oai_embedding_model", "text-embedding-3-small")),
**chunker_params)
encoding = tiktoken.encoding_for_model(
self.params.get("oai_embedding_model", "text-embedding-3-small"))

# found = found + found_h2 + found_h3 # all become a new chunk
if len(found) < 2 and len(found_h2) > 2:
found = found_h2

if self.params.get("use_markdown_headers", True) and len(found) > 2:
text_chunks = self.chunk_text_by_markdown(text, found)
else:
text_chunks = text_splitter.split_text(text)
text_chunks = [(re.sub(' +', ' ', re.sub(r'\n\s', '\n', i)), "semantic") for i in text_chunks]

final_chunks = []
for chunk, chunk_type in text_chunks:
num_tokens = len(encoding.encode(chunk))
if num_tokens > self.params.get("max_chunk_size", 6000):
num_chunks = num_tokens // self.params.get("max_chunk_size", 6000) + bool(num_tokens % self.params.get("max_chunk_size", 6000))
text_splitter_temp = SemanticChunker(
OpenAIEmbeddings(model=self.params.get("oai_embedding_model", "text-embedding-3-small")),
number_of_chunks=num_chunks)

sub_chunks = text_splitter_temp.split_text(chunk)
sub_chunks = [[i, "semantic_size"] for i in sub_chunks]
final_chunks.extend(sub_chunks)
else:
final_chunks.append([chunk, chunk_type])
return final_chunks

def get_chunk_title(self, chunk):
"""Get chunk title using llm"""
try:
llm = ChatOpenAI(model_name=self.params.get("oai_gen_model", "gpt-4o-mini"))
response = llm.invoke(
f'Respond only with a suitable title for the following text: "{chunk}"')
return response.content
except Exception as e:
traceback.print_exc()
return None

async def process(self, document: ScrapedDocument, chunker_params: dict = None) -> ScrapedDocument:
body = document.body
body_chunks = self.chunk_text_semantically(body)
document.chunks = []
prev_title = None
prev_count = 1
for idx, (chunk, chunk_type) in enumerate(body_chunks):
chunk_title = None
if chunk_type == "regex":
chunk_title = chunk.strip("\n").split("\n")[0].strip("# ")
prev_title = chunk_title
elif chunk_type == "semantic_size" and prev_title is not None:
chunk_title = prev_title + '-' + str(prev_count)
prev_count += 1
else:
prev_title = None
prev_count = 1
chunk_title = None
if self.params.get("generate_titles", False):
chunk_title = self.get_chunk_title(chunk)
# Add chuks to input object
document.chunks.append({
"title": chunk_title,
"id": f"{document.id}-chunk{idx}",
"body": chunk
})

return document
9 changes: 9 additions & 0 deletions scraper/sources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,27 @@ github:
- name: BIPs
domain: https://github.com/bitcoin/bips
url: https://github.com/bitcoin/bips.git
processors:
- semantic_chunking
- name: BOLTs
domain: https://github.com/lightning/bolts
url: https://github.com/lightning/bolts.git
processors:
- semantic_chunking
- name: BLIPs
domain: https://github.com/lightning/blips
url: https://github.com/lightning/blips.git
processors:
- semantic_chunking
- name: BitcoinOps
domain: https://bitcoinops.org
url: https://github.com/bitcoinops/bitcoinops.github.io.git
directories:
_posts/en: post
_topics/en: topic
processors:
- summarization
- semantic_chunking
- name: BitcoinTranscripts
domain: https://btctranscripts.com
url: https://github.com/bitcointranscripts/bitcointranscripts.git
Expand Down
12 changes: 12 additions & 0 deletions semantic_chunker_params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"breakpoint_threshold_type": "standard_deviation",
"buffer_size": 4,
"breakpoint_threshold_amount": 3,
"sentence_split_regex": "(?<=[\\.\\!\\?\n])(\n?)\\s+",
"oai_embedding_model": "text-embedding-3-small",
"oai_gen_model": "gpt-4o-mini",
"chunk_threshold": 6000,
"use_markdown_headers": false,
"max_chunk_size": 6000,
"generate_titles": false
}