| 
 | 1 | +---  | 
 | 2 | +title: '💬 Chat with Webpage'  | 
 | 3 | +description: 'Build a RAG chatbot for any webpage using ScrapeGraph and LanceDB'  | 
 | 4 | +---  | 
 | 5 | + | 
 | 6 | +<img  | 
 | 7 | +  style={{ borderRadius: '0.5rem' }}  | 
 | 8 | +  src="/cookbook/images/chat-webpage-banner.png"  | 
 | 9 | +/>  | 
 | 10 | + | 
 | 11 | +Learn how to build a RAG (Retrieval-Augmented Generation) chatbot that can answer questions about any webpage by combining ScrapeGraph's Markdownify service with the LanceDB vector store and OpenAI.  | 
 | 12 | + | 
 | 13 | +<Note>  | 
 | 14 | +Try it yourself in our interactive notebook:  | 
 | 15 | +- [Burr Implementation](https://github.com/ScrapeGraphAI/scrapegraph-sdk/blob/main/cookbook/chat-webpage-simple-rag/scrapegraph_burr_lancedb.ipynb)  | 
 | 16 | +</Note>  | 
 | 17 | + | 
 | 18 | +## The Goal  | 
 | 19 | + | 
 | 20 | +We'll create a chatbot that can:  | 
 | 21 | + | 
 | 22 | +| Feature | Description |  | 
 | 23 | +| ----- | ----------- |  | 
 | 24 | +| Webpage Ingestion | Convert any webpage to markdown format |  | 
 | 25 | +| Content Chunking | Split content into manageable chunks |  | 
 | 26 | +| Vector Storage | Store and index chunks in LanceDB |  | 
 | 27 | +| Question Answering | Answer questions using relevant chunks |  | 
 | 28 | + | 
 | 29 | +## Code Example  | 
 | 30 | + | 
 | 31 | +```python  | 
 | 32 | +from burr.core import action, State, ApplicationBuilder  | 
 | 33 | +from scrapegraph_py import Client  | 
 | 34 | +import lancedb  | 
 | 35 | +from lancedb.pydantic import LanceModel, Vector  | 
 | 36 | +import openai  | 
 | 37 | +import tiktoken  | 
 | 38 | +from typing import List, Optional  | 
 | 39 | + | 
 | 40 | +# Schema for storing text chunks  | 
 | 41 | +class TextDocument(LanceModel):  | 
 | 42 | +    url: str  | 
 | 43 | +    position: int  | 
 | 44 | +    text: str  | 
 | 45 | +    vector: Vector(dim=1536)  # OpenAI embedding dimensions  | 
 | 46 | + | 
 | 47 | +# Action to fetch and convert webpage to markdown  | 
 | 48 | +@action(reads=[], writes=["markdown_content"])  | 
 | 49 | +def fetch_webpage(state: State, webpage_url: str) -> State:  | 
 | 50 | +    client = Client()  | 
 | 51 | +    response = client.markdownify(website_url=webpage_url)  | 
 | 52 | +    return state.update(markdown_content=response["result"])  | 
 | 53 | + | 
 | 54 | +# Action to embed and store chunks  | 
 | 55 | +@action(reads=["markdown_content"], writes=[])  | 
 | 56 | +def embed_and_store(state: State, webpage_url: str) -> State:  | 
 | 57 | +    chunks = get_text_chunks(state["markdown_content"])  | 
 | 58 | +    con = lancedb.connect("./webpages")  | 
 | 59 | +    table = con.create_table("chunks", schema=TextDocument)  | 
 | 60 | +    table.add([{  | 
 | 61 | +        "text": chunk,  | 
 | 62 | +        "url": webpage_url,  | 
 | 63 | +        "position": i  | 
 | 64 | +    } for i, chunk in enumerate(chunks)])  | 
 | 65 | +    return state  | 
 | 66 | + | 
 | 67 | +# Action to answer questions  | 
 | 68 | +@action(reads=[], writes=["llm_answer"])  | 
 | 69 | +def ask_question(state: State, user_query: str) -> State:  | 
 | 70 | +    chunks_table = lancedb.connect("./webpages").open_table("chunks")  | 
 | 71 | +    relevant_chunks = chunks_table.search(user_query).limit(3).to_list()  | 
 | 72 | +      | 
 | 73 | +    response = openai.chat.completions.create(  | 
 | 74 | +        model="gpt-4",  | 
 | 75 | +        messages=[  | 
 | 76 | +            {"role": "system", "content": f"Answer based on: {relevant_chunks}"},  | 
 | 77 | +            {"role": "user", "content": user_query}  | 
 | 78 | +        ]  | 
 | 79 | +    )  | 
 | 80 | +    return state.update(llm_answer=response.choices[0].message.content)  | 
 | 81 | +```  | 
 | 82 | + | 
 | 83 | +## Example Output  | 
 | 84 | + | 
 | 85 | +```json  | 
 | 86 | +{  | 
 | 87 | +    "question": "Who are the founders of ScrapeGraphAI?",  | 
 | 88 | +    "answer": "The founders of ScrapeGraphAI are:\n\n1. Marco Perini - Founder & Technical Lead\n2. Marco Vinciguerra - Founder & Software Engineer\n3. Lorenzo Padoan - Founder & Product Engineer"  | 
 | 89 | +}  | 
 | 90 | +```  | 
 | 91 | + | 
 | 92 | +<CardGroup cols={2}>  | 
 | 93 | +  <Card  | 
 | 94 | +    title="Markdownify"  | 
 | 95 | +    icon="robot"  | 
 | 96 | +    href="/services/markdownify"  | 
 | 97 | +  >  | 
 | 98 | +    Learn more about our webpage-to-markdown service  | 
 | 99 | +  </Card>  | 
 | 100 | +  <Card  | 
 | 101 | +    title="Python SDK"  | 
 | 102 | +    icon="python"  | 
 | 103 | +    href="/sdks/python"  | 
 | 104 | +  >  | 
 | 105 | +    Explore our Python SDK documentation  | 
 | 106 | +  </Card>  | 
 | 107 | +</CardGroup>  | 
 | 108 | + | 
 | 109 | +---  | 
 | 110 | + | 
 | 111 | +<Note>  | 
 | 112 | +Have a suggestion for a new example? [Contact us](mailto:[email protected]) with your use case, or contribute directly on [GitHub](https://github.com/ScrapeGraphAI/scrapegraph-sdk).  | 
 | 113 | +</Note>   | 
0 commit comments