-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexers.py
28 lines (24 loc) · 1.28 KB
/
indexers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from loaders import HtmlDocumentLoader
from preprocessors import GithubBlogpostPreprocessor, ArxivHtmlPaperPreprocessor
from text_splitters import SimpleCharacterTextSplitter
from wcs_client_adapter import WcsClientAdapter
# Passed as the second argument to HtmlDocumentLoader, next to the document URI.
# NOTE(review): presumably the directory where fetched documents are cached -- confirm in loaders.
CACHE_PATH = "./loader_cache"
# Passed positionally, in this order, to SimpleCharacterTextSplitter.
# NOTE(review): presumably counted in characters, going by the splitter's name -- confirm in text_splitters.
CHUNK_SIZE = 150
OVERLAP_SIZE = 25
# The only document URIs NaiveWcsIndexer accepts; each one selects its own preprocessor.
GITHUB_BLOG_POST = "https://lilianweng.github.io/posts/2023-06-23-agent/"
ARXIV_RAG_SURVEY_PAPER = "https://arxiv.org/html/2312.10997v5"
class NaiveWcsIndexer:
    """Index one supported HTML document through ``WcsClientAdapter``.

    All of the work happens in the constructor: the document is loaded,
    reduced to text by the preprocessor matching its URI, split with
    ``SimpleCharacterTextSplitter`` and the resulting splits are handed to
    ``WcsClientAdapter``. Creating an instance is therefore not a passive
    operation.
    """

    # TODO: this calling syntax doesn't make it clear what side effects the constructor has
    def __init__(self, doc_uri):
        """Load, preprocess, split and insert the document at ``doc_uri``.

        Args:
            doc_uri: Must compare equal to ``GITHUB_BLOG_POST`` or
                ``ARXIV_RAG_SURVEY_PAPER``.

        Raises:
            ValueError: If ``doc_uri`` matches neither supported URI. The
                loader has already been constructed at that point.
        """
        # The loader is built first, before the URI is validated.
        self._loader = HtmlDocumentLoader(doc_uri, CACHE_PATH)
        self._preprocessor = self._select_preprocessor(doc_uri)
        self._text_splitter = SimpleCharacterTextSplitter(CHUNK_SIZE, OVERLAP_SIZE)

        # Pipeline: load -> extract text -> split.
        raw_document = self._loader.load()
        plain_text = self._preprocessor.get_text(raw_document)
        chunks = self._text_splitter.split_text(plain_text)

        # The collection is set up before the splits are inserted.
        WcsClientAdapter.setup_collection()
        WcsClientAdapter.insert_text_splits(chunks)

    @staticmethod
    def _select_preprocessor(doc_uri):
        """Return a new preprocessor instance for ``doc_uri`` or raise ``ValueError``."""
        if doc_uri == GITHUB_BLOG_POST:
            return GithubBlogpostPreprocessor()
        if doc_uri == ARXIV_RAG_SURVEY_PAPER:
            return ArxivHtmlPaperPreprocessor()
        raise ValueError(
            f"Unsupported document URI: {doc_uri}. "
            "Expected URIs are GitHub blog post or arXiv RAG survey paper."
        )