edition3 #5

Open · wants to merge 1 commit into main
Empty file modified .gitignore (mode 100644 → 100755)
2 changes: 2 additions & 0 deletions Makefile
@@ -0,0 +1,2 @@
run-ci:
	black src/
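
For reference, the CI target can be invoked locally (assuming `black` is installed in the environment):

```bash
make run-ci   # runs black over src/
```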
9 changes: 9 additions & 0 deletions README.md
@@ -0,0 +1,9 @@
# Installation
```bash
git clone git@github.com:explodinggradients/synthetic-qa-paper.git
cd synthetic-qa-paper
pip install -e .
```

## Usage
See the example [notebook](notebooks/experiment.ipynb) for usage.
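
After the editable install above, the package is imported under the `ragbench` name, as in the example notebook:

```python
from ragbench import RAGPipeline, RAGEval, RAGTools
```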
1 change: 0 additions & 1 deletion _run.ipynb

This file was deleted.

Binary file removed caches/sample_experiment/cached_chunks.pkl
Binary file not shown.
Binary file removed caches/sample_experiment/cached_docs.pkl
Binary file not shown.
Binary file removed caches/sample_experiment/cached_queries.pkl
Binary file not shown.
Binary file removed caches/sample_experiment/cached_reranked_rets.pkl
Binary file not shown.
Binary file removed caches/sample_experiment/cached_rets.pkl
Binary file not shown.
Binary file removed caches/sample_experiment/cached_vector_db/index.pkl
Binary file not shown.
14,068 changes: 14,068 additions & 0 deletions datasets/multihoprag.json

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions experiment_configs/sample.yaml
@@ -0,0 +1,42 @@
--- # TODO:
# - A reranker is fairly easy to implement; doing so would remove dependencies (ragatouille, colbert, ninja, etc.).

cache_dir: ./caches/sample_experiment

dataset_path: ./datasets/multihoprag.json
results_path: ./results/multihoprag.json

text_splitter_model_id: thenlper/gte-small # used only when text_splitter is hf_tokenizer or tiktoken
embedding_model_id: thenlper/gte-small # HF model id, e.g. BAAI/bge-large-en-v1.5 or BAAI/llm-embedder
reranker_model_id: colbert-ir/colbertv2.0 # HF model id, use `null` to turn off the reranker
generator_model_id: gpt-4o-mini # model id for HF, OpenAI, or TogetherAI

text_splitter: hf_tokenizer # select from [char, recursive_char, hf_tokenizer, tiktoken]
generator: openai # select from [hf, together, openai]

split_overlap: 0.1 # fraction of overlap between consecutive chunks
chunk_size: 256
num_retrievals: 10 # for the retriever
num_selections: 4 # for the reranker

api_key: PLACE_YOUR_API_KEY

generator_model_config: # only for HF models
max_new_tokens: 256
return_full_text: False
temperature: 0

## RAG Prompts
system_prompt: |
Using the information contained in the context, give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not generate any response on your own and just say `answer not found`.


context_prompt: |
Context:
{CONTEXT}
---
Now here is the question you need to answer.
{QUERY}
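
For reference, a minimal sketch of driving the pipeline from this config. PyYAML is assumed for inspection; `RAGPipeline.run_pipeline_from_yaml` is the entry point demonstrated in `notebooks/experiment.ipynb`:

```python
# Minimal sketch: inspect and run the experiment defined by the YAML above.
# Assumes PyYAML is installed; run_pipeline_from_yaml is shown in the notebook.
import yaml
from ragbench import RAGPipeline

with open("experiment_configs/sample.yaml") as f:
    cfg = yaml.safe_load(f)
print(cfg["chunk_size"], cfg["num_retrievals"])  # 256, 10

# The pipeline reads the file directly:
RAGPipeline.run_pipeline_from_yaml("experiment_configs/sample.yaml")
```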

323 changes: 323 additions & 0 deletions notebooks/experiment.ipynb
@@ -0,0 +1,323 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from ragbench import RAGPipeline, RAGEval, RAGTools\n",
"import json"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "f8oGHLfl7yXv"
},
"source": [
"# From YAML Config"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2tHtDlu_7yKX"
},
"outputs": [],
"source": [
"!cat ../experiment_configs/sample.yaml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"RAGPipeline.run_pipeline_from_yaml('../experiment_configs/sample.yaml')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "a50pvYFQ7vXX"
},
"source": [
"# Step by Step"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "QrIL6z4wd6FF"
},
"source": [
"### Loading Corpus & Queries and Creating RAG Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "azESxxMHg-8_"
},
"outputs": [],
"source": [
"system_prompt = '''Using the information contained in the context, give a comprehensive answer to the question.\n",
"Respond only to the question asked, response should be concise and relevant to the question.\n",
"If the answer cannot be deduced from the context, do not generate any response on your own and just say `answer not found`.\n",
"'''\n",
"\n",
"context_prompt = '''Context:\n",
"{CONTEXT}\n",
"---\n",
"Now here is the question you need to answer.\n",
"{QUERY}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "FBxCmstLr-pV"
},
"outputs": [],
"source": [
"with open('../datasets/multihoprag.json', 'r') as file:\n",
" ds = json.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "mGT3huj4PvjS"
},
"outputs": [],
"source": [
"rag_pipe = RAGPipeline('../caches/sample_experiment', system_prompt, context_prompt)\n",
"ls_docs = ds['corpus']\n",
"ls_queries = list(ds['gold_answers'].keys())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bYP1N4TOTNDV"
},
"source": [
"### Vectorize"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dYrgyZnqPvjS",
"outputId": "1c77a7e4-b97a-42ee-ee04-53c562afaa36"
},
"outputs": [],
"source": [
"rag_pipe.load_embedding_model_from_hf('thenlper/gte-small')\n",
"text_splitter = RAGTools.load_text_splitter_hf_tokenizer(rag_pipe.embedding_tokenizer, 256, 0.1)\n",
"ls_chunks = rag_pipe.split_docs(ls_docs, text_splitter)\n",
"rag_pipe.prepare_vector_db(ls_chunks)"
]
},
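{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch of how chunk_size=256 and split_overlap=0.1 are assumed to\n",
"# combine (an illustration, not RAGTools internals): the overlap fraction\n",
"# should translate into ~25 shared tokens between neighbouring chunks.\n",
"chunk_size, split_overlap = 256, 0.1\n",
"overlap_tokens = int(chunk_size * split_overlap)  # 25\n",
"stride = chunk_size - overlap_tokens              # 231 new tokens per chunk\n",
"print(overlap_tokens, stride)"
]
},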
{
"cell_type": "markdown",
"metadata": {
"id": "q3ysSYT0TQPh"
},
"source": [
"### Retrieve"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QXMqWAYmSQg_",
"outputId": "8648ccff-7d7b-4d74-e25f-e8f3f7d4b53f"
},
"outputs": [],
"source": [
"ls_rets = rag_pipe.retrieve(ls_queries, 10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rf7GgGAjTSrd"
},
"source": [
"### Rerank"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "YV3dzXM2TUSr"
},
"outputs": [],
"source": [
"rag_pipe.load_reranker_model_from_hf('colbert-ir/colbertv2.0')\n",
"ls_reranked_rets = rag_pipe.rerank(ls_rets, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4EFoFQgtWVKh"
},
"source": [
"### Evaluate Retriever"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rets, golds = ls_reranked_rets, ds['gold_retrieves']\n",
"RAGEval.retrieval_metrics(rets, golds)\n",
"\n",
"# or\n",
"\n",
"# rets, golds = ls_rets, ds['gold_retrieves']\n",
"# hit10 = RAGEval.hits_at(10, rets, golds)\n",
"# hit4 = RAGEval.hits_at(4, rets, golds)\n",
"# map10 = RAGEval.map_at(10, rets, golds)\n",
"# mrr10 = RAGEval.mrr_at(10, rets, golds)"
]
},
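{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch (not the RAGEval implementation) of what hits@k, MRR@k and\n",
"# MAP@k typically compute, assuming `rets` maps each query to a ranked list\n",
"# of retrieved ids and `golds` maps each query to its set of relevant ids.\n",
"def hits_at(k, rets, golds):\n",
"    # fraction of queries with at least one relevant id in the top k\n",
"    return sum(any(d in golds[q] for d in rets[q][:k]) for q in rets) / len(rets)\n",
"\n",
"def mrr_at(k, rets, golds):\n",
"    # mean reciprocal rank of the first relevant id within the top k\n",
"    total = 0.0\n",
"    for q in rets:\n",
"        for rank, d in enumerate(rets[q][:k], start=1):\n",
"            if d in golds[q]:\n",
"                total += 1.0 / rank\n",
"                break\n",
"    return total / len(rets)\n",
"\n",
"def map_at(k, rets, golds):\n",
"    # mean average precision over the top k\n",
"    total = 0.0\n",
"    for q in rets:\n",
"        hits, precision_sum = 0, 0.0\n",
"        for rank, d in enumerate(rets[q][:k], start=1):\n",
"            if d in golds[q]:\n",
"                hits += 1\n",
"                precision_sum += hits / rank\n",
"        total += precision_sum / max(min(len(golds[q]), k), 1)\n",
"    return total / len(rets)"
]
},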
{
"cell_type": "markdown",
"metadata": {
"id": "qwKf6LUBesmW"
},
"source": [
"### Generate Responses"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pCneBXYMfXGb",
"outputId": "ff4d227d-203b-4762-a692-c218f3654bce"
},
"outputs": [],
"source": [
"ls_prompts = rag_pipe.create_prompts(ls_rets)\n",
"\n",
"# Sampling 10 for testing, turn this off for actual experiments\n",
"random_queries = list(ls_prompts.keys())[:10]\n",
"random_prompts = {k:ls_prompts[k] for k in random_queries}\n",
"\n",
"rag_pipe.load_generator_model_from_openai('gpt-4o-mini', API_KEY)\n",
"ls_responses = rag_pipe.generate_responses(random_prompts)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Iw31kY8Lxv3Z"
},
"source": [
"### Evaluate Answer Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "E6FB5wQ4xvf_",
"outputId": "f42e9b66-47b1-4749-858e-6c130194a08f"
},
"outputs": [],
"source": [
"metric_per_query = RAGEval.generation_metrics(ls_responses, ds['gold_answers'], rag_pipe.embedding_model)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "temp4",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
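
Since `RAGEval.generation_metrics` receives the embedding model, an embedding-similarity score between each response and its gold answer is a natural reading. A hedged sketch of such a metric (an assumption, not the library's implementation; `encode` follows the sentence-transformers convention):

```python
# Hedged sketch of an embedding-similarity generation metric. Assumes a
# sentence-transformers style model exposing .encode(); not RAGEval's code.
import numpy as np

def answer_similarity(responses, gold_answers, embedding_model):
    scores = {}
    for query, response in responses.items():
        a = embedding_model.encode(response)
        b = embedding_model.encode(gold_answers[query])
        # cosine similarity between response and gold-answer embeddings
        scores[query] = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return scores
```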