|
16 | 16 | }, |
17 | 17 | { |
18 | 18 | "cell_type": "code", |
19 | | - "execution_count": null, |
| 19 | + "execution_count": 25, |
20 | 20 | "metadata": {}, |
21 | | - "outputs": [], |
| 21 | + "outputs": [ |
| 22 | + { |
| 23 | + "data": { |
| 24 | + "text/plain": [ |
| 25 | + "<haystack.core.pipeline.pipeline.Pipeline object at 0x34dbfbe60>\n", |
| 26 | + "🚅 Components\n", |
| 27 | + " - link_fetcher: LinkContentFetcher\n", |
| 28 | + " - converter: HTMLToDocument\n", |
| 29 | + " - splitter: DocumentSplitter\n", |
| 30 | + " - embedder: SentenceTransformersDocumentEmbedder\n", |
| 31 | + " - writer: DocumentWriter\n", |
| 32 | + "🛤️ Connections\n", |
| 33 | + " - link_fetcher.streams -> converter.sources (List[ByteStream])\n", |
| 34 | + " - converter.documents -> splitter.documents (List[Document])\n", |
| 35 | + " - splitter.documents -> embedder.documents (List[Document])\n", |
| 36 | + " - embedder.documents -> writer.documents (List[Document])" |
| 37 | + ] |
| 38 | + }, |
| 39 | + "execution_count": 25, |
| 40 | + "metadata": {}, |
| 41 | + "output_type": "execute_result" |
| 42 | + } |
| 43 | + ], |
22 | 44 | "source": [ |
23 | 45 | "from haystack import Pipeline\n", |
24 | 46 | "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", |
|
55 | 77 | }, |
56 | 78 | { |
57 | 79 | "cell_type": "code", |
58 | | - "execution_count": 21, |
| 80 | + "execution_count": 26, |
59 | 81 | "metadata": {}, |
60 | 82 | "outputs": [], |
61 | 83 | "source": [ |
|
78 | 100 | }, |
79 | 101 | { |
80 | 102 | "cell_type": "code", |
81 | | - "execution_count": null, |
| 103 | + "execution_count": 27, |
82 | 104 | "metadata": {}, |
83 | | - "outputs": [], |
| 105 | + "outputs": [ |
| 106 | + { |
| 107 | + "name": "stderr", |
| 108 | + "output_type": "stream", |
| 109 | + "text": [ |
| 110 | + "Batches: 100%|██████████| 1/1 [00:01<00:00, 1.11s/it]\n" |
| 111 | + ] |
| 112 | + }, |
| 113 | + { |
| 114 | + "data": { |
| 115 | + "text/plain": [ |
| 116 | + "{'writer': {'documents_written': 20}}" |
| 117 | + ] |
| 118 | + }, |
| 119 | + "execution_count": 27, |
| 120 | + "metadata": {}, |
| 121 | + "output_type": "execute_result" |
| 122 | + } |
| 123 | + ], |
84 | 124 | "source": [ |
85 | 125 | "indexing_pipeline.run(data={\"link_fetcher\":{\"urls\": [\"https://haystack.deepset.ai/integrations/elasticsearch-document-store\",\n", |
86 | 126 | " \"https://haystack.deepset.ai/tutorials/27_first_rag_pipeline/\",\n", |
|
103 | 143 | }, |
104 | 144 | { |
105 | 145 | "cell_type": "code", |
106 | | - "execution_count": 3, |
| 146 | + "execution_count": 28, |
107 | 147 | "metadata": {}, |
108 | 148 | "outputs": [], |
109 | 149 | "source": [ |
|
124 | 164 | }, |
125 | 165 | { |
126 | 166 | "cell_type": "code", |
127 | | - "execution_count": 10, |
| 167 | + "execution_count": 29, |
128 | 168 | "metadata": {}, |
129 | 169 | "outputs": [], |
130 | 170 | "source": [ |
|
137 | 177 | "\n", |
138 | 178 | "######## Complete this section #############\n", |
139 | 179 | "prompt_template = \"\"\"\n", |
140 | | - "You are an expert Python software engineer, you are asked to write code, \n", |
| 180 | + "You are an expert Python software engineer, you are asked to write Haystack 2.0 pipelines for indexing and querying documents., \n", |
141 | 181 | "explain code and you use the context provided to generate accurate and functional code along with clear explanations.\n", |
142 | 182 | "After you define a class, you also provide examples of using the class and its methods.\n", |
143 | 183 | "You must only use information from the given documents and cite the documents you used by mentioning their URL in the answer.\n", |
|
159 | 199 | }, |
160 | 200 | { |
161 | 201 | "cell_type": "code", |
162 | | - "execution_count": null, |
| 202 | + "execution_count": 30, |
163 | 203 | "metadata": {}, |
164 | | - "outputs": [], |
| 204 | + "outputs": [ |
| 205 | + { |
| 206 | + "data": { |
| 207 | + "text/plain": [ |
| 208 | + "<haystack.core.pipeline.pipeline.Pipeline object at 0x350d3a6c0>\n", |
| 209 | + "🚅 Components\n", |
| 210 | + " - query_embedder: SentenceTransformersTextEmbedder\n", |
| 211 | + " - retriever: InMemoryEmbeddingRetriever\n", |
| 212 | + " - prompt_builder: PromptBuilder\n", |
| 213 | + " - llm: OpenAIGenerator\n", |
| 214 | + "🛤️ Connections\n", |
| 215 | + " - query_embedder.embedding -> retriever.query_embedding (List[float])\n", |
| 216 | + " - retriever.documents -> prompt_builder.documents (List[Document])\n", |
| 217 | + " - prompt_builder.prompt -> llm.prompt (str)" |
| 218 | + ] |
| 219 | + }, |
| 220 | + "execution_count": 30, |
| 221 | + "metadata": {}, |
| 222 | + "output_type": "execute_result" |
| 223 | + } |
| 224 | + ], |
165 | 225 | "source": [ |
166 | 226 | "pipeline = Pipeline()\n", |
167 | 227 | "pipeline.add_component(instance=query_embedder, name=\"query_embedder\")\n", |
|
192 | 252 | }, |
193 | 253 | { |
194 | 254 | "cell_type": "code", |
195 | | - "execution_count": null, |
| 255 | + "execution_count": 31, |
196 | 256 | "metadata": {}, |
197 | | - "outputs": [], |
| 257 | + "outputs": [ |
| 258 | + { |
| 259 | + "name": "stderr", |
| 260 | + "output_type": "stream", |
| 261 | + "text": [ |
| 262 | + "Batches: 100%|██████████| 1/1 [00:00<00:00, 17.85it/s]\n" |
| 263 | + ] |
| 264 | + }, |
| 265 | + { |
| 266 | + "name": "stdout", |
| 267 | + "output_type": "stream", |
| 268 | + "text": [ |
| 269 | + "https://haystack.deepset.ai/integrations/elasticsearch-document-store and https://haystack.deepset.ai/tutorials/27_first_rag_pipeline/ , to start the writing of pipeline, we need to first import the necessary modules from Haystack and initialize the ElasticSearch Document store. \n", |
| 270 | + "\n", |
| 271 | + "```python\n", |
| 272 | + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", |
| 273 | + "from haystack.components.builders import PromptBuilder\n", |
| 274 | + "from haystack.components.generators import RAGenerator\n", |
| 275 | + "from haystack.components.evaluators.rag import RAGAsEvaluater\n", |
| 276 | + "from haystack import Pipeline\n", |
| 277 | + "from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore\n", |
| 278 | + "\n", |
| 279 | + "document_store = ElasticsearchDocumentStore(hosts=\"http://localhost:9200\")\n", |
| 280 | + "```\n", |
| 281 | + "\n", |
| 282 | + "Next, initialize the TextEmbedder that will create embeddings for the user query.\n", |
| 283 | + "\n", |
| 284 | + "```python\n", |
| 285 | + "text_embedder = SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", |
| 286 | + "```\n", |
| 287 | + "\n", |
| 288 | + "Define a template prompt that will guide the generation of answers. \n", |
| 289 | + "\n", |
| 290 | + "```python\n", |
| 291 | + "template = \"\"\"\n", |
| 292 | + "Given the following information, answer the question.\n", |
| 293 | + "Context:\n", |
| 294 | + "{% for document in documents %}\n", |
| 295 | + "{{ document.content }}\n", |
| 296 | + "{% endfor %}\n", |
| 297 | + "Question: {{question}}\n", |
| 298 | + "Answer:\n", |
| 299 | + "\"\"\"\n", |
| 300 | + "prompt_builder = PromptBuilder(template=template)\n", |
| 301 | + "```\n", |
| 302 | + " \n", |
| 303 | + "Then, initialize the Generators and Evaluators. We are using RAGenerator for generating answers with RAG approach and RAGAsEvaluater for evaluating generated answers.\n", |
| 304 | + "\n", |
| 305 | + "```python\n", |
| 306 | + "generator = RAGenerator(model=\"rag-token-nq\")\n", |
| 307 | + "evaluator = RAGAsEvaluater()\n", |
| 308 | + "```\n", |
| 309 | + "Now, build the pipeline by adding the components and connecting them. \n", |
| 310 | + "\n", |
| 311 | + "```python\n", |
| 312 | + "pipeline = Pipeline()\n", |
| 313 | + "pipeline.add_component(\"text_embedder\", text_embedder)\n", |
| 314 | + "pipeline.add_component(\"prompt_builder\", prompt_builder)\n", |
| 315 | + "pipeline.add_component(\"generator\", generator)\n", |
| 316 | + "pipeline.add_component(\"evaluator\", evaluator)\n", |
| 317 | + "\n", |
| 318 | + "pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n", |
| 319 | + "pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", |
| 320 | + "pipeline.connect(\"prompt_builder\", \"generator\")\n", |
| 321 | + "pipeline.connect(\"generator\", \"evaluator\")\n", |
| 322 | + "```\n", |
| 323 | + "\n", |
| 324 | + "Finally, run the pipeline with a question to generate and evaluate an answer.\n", |
| 325 | + "\n", |
| 326 | + "```python\n", |
| 327 | + "question = \"What does Rhodes Statue look like?\"\n", |
| 328 | + "response = pipeline.run({\"text_embedder\": {\"text\": question}, \"prompt_builder\": {\"question\": question}})\n", |
| 329 | + "print(response[\"evaluator\"][\"evaluation\"])\n", |
| 330 | + "```\n", |
| 331 | + "Please note that the code is hypothetical given the documents provided and may need modifications as per actual setup and requirement. The model name used for SentenceTransformersTextEmbedder and RAGenerator may need to change based on the desired performance and characteristics. The connection of the components in the pipeline may vary based on the actual need.\n" |
| 332 | + ] |
| 333 | + } |
| 334 | + ], |
198 | 335 | "source": [ |
199 | 336 | "question = \"Write a Haystack 2.0 pipeline that connects to Elastic Search and answers questions about its knowledge, the pipeline\\\n", |
200 | 337 | " should include a prompt template with instructions and the template should iterate over all documents in the \\\n", |
201 | | - " context, the pipeline should also include a feedback loop and guardrails to ensure that the answers are accurate.\"\n", |
| 338 | + " context, the pipeline should also incorporate evaluation of the generated answers through RAGAS.\"\n", |
202 | 339 | "result = pipeline.run(data={\"query_embedder\": {\"text\": question}, \"prompt_builder\": {\"query\": question}})\n", |
203 | 340 | "print(result['llm']['replies'][0])" |
204 | 341 | ] |
|