rag_example.py
"""
Example usage of different document loaders (smart-llm-loader and PyMuPDF) for RAG applications.
"""
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from smart_llm_loader import SmartLLMLoader

# Load environment variables from a local .env file, if present
load_dotenv()

# OpenAI API key, needed because gpt-4o-mini handles question answering.
# setdefault avoids clobbering a key already supplied via .env.
os.environ.setdefault("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

# Gemini API key, needed because SmartLLMLoader parses the PDF with Gemini Flash
os.environ.setdefault("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY")


def create_rag_chain(retriever, llm):
    """Create a RAG chain with the given retriever and LLM."""
    prompt_template = PromptTemplate.from_template(
        """
        You are an assistant for question-answering tasks.
        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.

        Question: {question}
        Context: {context}
        Answer:"""
    )

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # LCEL pipeline: the question fans out to the retriever (whose documents are
    # joined into one context string) and is also passed through unchanged; both
    # then feed the prompt, the LLM, and finally a plain-string output parser.
    return (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | StrOutputParser()
    )


def process_with_llmloader():
    """Process documents using SmartLLMLoader with Gemini Flash."""
    llm = ChatOpenAI(model="gpt-4o-mini")

    # Initialize the loader from the smart-llm-loader package
    loader = SmartLLMLoader(
        file_path="./data/test_ocr_doc.pdf",
        chunk_strategy="contextual",
        model="gemini/gemini-1.5-flash",
    )
    docs = loader.load_and_split()

    vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings())
    rag_chain = create_rag_chain(vectorstore.as_retriever(), llm)
    return rag_chain


def process_with_pymupdf():
    """Process documents using PyMuPDF with recursive chunking."""
    llm = ChatOpenAI(model="gpt-4o-mini")

    # Load the document with PyMuPDF
    loader = PyMuPDFLoader("./data/test_ocr_doc.pdf")
    documents = loader.load()

    # Split into fixed-size chunks with overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    docs = text_splitter.split_documents(documents)

    vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings())
    rag_chain = create_rag_chain(vectorstore.as_retriever(), llm)
    return rag_chain
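
# Note: the two pipelines differ only in chunking. SmartLLMLoader's "contextual"
# strategy delegates splitting to the Gemini Flash model, while
# RecursiveCharacterTextSplitter cuts fixed ~1000-character windows with a
# 200-character overlap regardless of content.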


def main():
    # Example using SmartLLMLoader
    print("\n=== Using SmartLLMLoader ===")
    llm_chain = process_with_llmloader()
    question = "What is the total gross worth for item 1 and item 7?"
    answer = llm_chain.invoke(question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

    # Example using PyMuPDF
    print("\n=== Using PyMuPDF ===")
    pymupdf_chain = process_with_pymupdf()
    answer = pymupdf_chain.invoke(question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
if __name__ == "__main__":
main()