|
188 | 188 | "train_dataset, test_dataset = load_dataset(\"ccdv/pubmed-summarization\", split=[\"train\", \"test\"])\n"
|
189 | 189 | ]
|
190 | 190 | },
|
| 191 | + { |
| 192 | + "cell_type": "markdown", |
| 193 | + "id": "3399abb1-af8f-46ee-92ea-c8344eeddd09", |
| 194 | + "metadata": {}, |
| 195 | + "source": [ |
| 196 | + "## Finetuning our Model Locally" |
| 197 | + ] |
| 198 | + }, |
191 | 199 | {
|
192 | 200 | "cell_type": "markdown",
|
193 | 201 | "id": "ed6ddff1-2636-4e3b-88ee-e3c86c584245",
|
|
210 | 218 | "outputs": [],
|
211 | 219 | "source": [
|
212 | 220 | "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
|
| 221 | + "model_name=\"google/flan-t5-small\"\n", |
213 | 222 | "\n",
|
214 |
| - "model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n", |
215 |
| - "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")" |
| 223 | + "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n", |
| 224 | + "tokenizer = AutoTokenizer.from_pretrained(model_name)" |
216 | 225 | ]
|
217 | 226 | },
|
218 | 227 | {
|
|
253 | 262 | "test_dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"abstracts\"])"
|
254 | 263 | ]
|
255 | 264 | },
|
| 265 | + { |
| 266 | + "cell_type": "markdown", |
| 267 | + "id": "b3ffd612-abde-4666-8c85-cc7069de2129", |
| 268 | + "metadata": {}, |
| 269 | + "source": [ |
| 270 | + "The first step to training our model other than setting up our datasets is to set our **hyperparameters**. Hyperparameters depend on your training script and for this one we need to identify our model, the location of our train and test files, etc. iN this case we are using a one created by Hugging Face." |
| 271 | + ] |
| 272 | + }, |
| 273 | + { |
| 274 | + "cell_type": "code", |
| 275 | + "execution_count": null, |
| 276 | + "id": "c06bef19-cc3c-476f-943c-78368e9f49e8", |
| 277 | + "metadata": {}, |
| 278 | + "outputs": [], |
| 279 | + "source": [ |
| 280 | + "from transformers import TrainingArguments\n", |
| 281 | + "\n", |
| 282 | + "training_args = TrainingArguments(output_dir=\"test_trainer\")" |
| 283 | + ] |
| 284 | + }, |
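| | + { |
| | + "cell_type": "markdown", |
| | + "id": "0b9d31c4-5f2e-4a7d-9c1e-2a6f8d4e7b10", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "`output_dir` is the only required argument here. As a minimal sketch, a few other commonly tuned hyperparameters are shown below; the exact values are illustrative assumptions, not recommendations." |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "id": "7d3e5a92-1b4c-4f6d-8e0a-9c2b5d7f3a41", |
| | + "metadata": {}, |
| | + "outputs": [], |
| | + "source": [ |
| | + "# a minimal sketch: these values are illustrative assumptions, tune them for your task\n", |
| | + "training_args = TrainingArguments(\n", |
| | + "    output_dir=\"test_trainer\",\n", |
| | + "    learning_rate=2e-5,\n", |
| | + "    per_device_train_batch_size=8,\n", |
| | + "    num_train_epochs=3,\n", |
| | + ")" |
| | + ] |
| | + }, |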
| 285 | + { |
| 286 | + "cell_type": "markdown", |
| 287 | + "id": "cff31d69-9f54-4235-a377-7c5e758fbca8", |
| 288 | + "metadata": {}, |
| 289 | + "source": [ |
| 290 | + "Next create setting to evaluate the models accuracy." |
| 291 | + ] |
| 292 | + }, |
| 293 | + { |
| 294 | + "cell_type": "code", |
| 295 | + "execution_count": null, |
| 296 | + "id": "24bbe62e-9140-4bef-88ae-3e5029ddb25c", |
| 297 | + "metadata": {}, |
| 298 | + "outputs": [], |
| 299 | + "source": [ |
| 300 | + "import numpy as np\n", |
| 301 | + "import evaluate\n", |
| 302 | + "\n", |
| 303 | + "metric = evaluate.load(\"accuracy\")" |
| 304 | + ] |
| 305 | + }, |
| 306 | + { |
| 307 | + "cell_type": "code", |
| 308 | + "execution_count": null, |
| 309 | + "id": "b82caeba-2daa-4526-b67d-04f45d4a9934", |
| 310 | + "metadata": {}, |
| 311 | + "outputs": [], |
| 312 | + "source": [ |
| 313 | + "def compute_metrics(eval_pred):\n", |
| 314 | + " logits, labels = eval_pred\n", |
| 315 | + " predictions = np.argmax(logits, axis=-1)\n", |
| 316 | + " return metric.compute(predictions=predictions, references=labels)" |
| 317 | + ] |
| 318 | + }, |
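| | + { |
| | + "cell_type": "markdown", |
| | + "id": "5e8c2b1a-4d7f-4c3e-9b6a-1f0d8e2c4a57", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "Note that token-level accuracy is a coarse signal for summarization; ROUGE (`evaluate.load(\"rouge\")`) is the more standard metric, but accuracy keeps this example simple." |
| | + ] |
| | + }, |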
| 319 | + { |
| 320 | + "cell_type": "code", |
| 321 | + "execution_count": null, |
| 322 | + "id": "f5b50ec0-87b8-4578-96aa-e26bda9d99b8", |
| 323 | + "metadata": {}, |
| 324 | + "outputs": [], |
| 325 | + "source": [ |
| 326 | + "from transformers import TrainingArguments, Trainer\n", |
| 327 | + "\n", |
| 328 | + "training_args = TrainingArguments(output_dir=\"test_trainer\", evaluation_strategy=\"epoch\")" |
| 329 | + ] |
| 330 | + }, |
| 331 | + { |
| 332 | + "cell_type": "markdown", |
| 333 | + "id": "df2225ac-8e92-4a14-a368-eebff9ead6bf", |
| 334 | + "metadata": {}, |
| 335 | + "source": [ |
| 336 | + "Finally we can train our model!" |
| 337 | + ] |
| 338 | + }, |
| 339 | + { |
| 340 | + "cell_type": "code", |
| 341 | + "execution_count": null, |
| 342 | + "id": "e59332ae-c9e3-4a9b-9a7c-7020c87227da", |
| 343 | + "metadata": {}, |
| 344 | + "outputs": [], |
| 345 | + "source": [ |
| 346 | + "trainer = Trainer(\n", |
| 347 | + " model=model,\n", |
| 348 | + " args=training_args,\n", |
| 349 | + " train_dataset=train_dataset,\n", |
| 350 | + " eval_dataset=test_dataset,\n", |
| 351 | + " compute_metrics=compute_metrics,\n", |
| 352 | + ")" |
| 353 | + ] |
| 354 | + }, |
| 355 | + { |
| 356 | + "cell_type": "code", |
| 357 | + "execution_count": null, |
| 358 | + "id": "f35520bb-b6ca-4996-b87e-2fbfdcfc0dff", |
| 359 | + "metadata": {}, |
| 360 | + "outputs": [], |
| 361 | + "source": [ |
| 362 | + "trainer.train()" |
| 363 | + ] |
| 364 | + }, |
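| | + { |
| | + "cell_type": "markdown", |
| | + "id": "8a4c9e2d-6b1f-4e5a-b3c7-0d9f2a6e8c14", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "With training done, the sketch below saves the finetuned model and generates a single summary with it. The save path, the `summarize:` prompt prefix, and the generation settings are assumptions, not fixed choices." |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "id": "2f7b4d81-9c3e-4a6f-8d25-7e1a0b5c9f36", |
| | + "metadata": {}, |
| | + "outputs": [], |
| | + "source": [ |
| | + "# a minimal sketch: the save path, prompt prefix, and generation settings are assumptions\n", |
| | + "trainer.save_model(\"test_trainer/final\")\n", |
| | + "tokenizer.save_pretrained(\"test_trainer/final\")\n", |
| | + "\n", |
| | + "sample = \"summarize: \" + \"<paste an article here>\"\n", |
| | + "inputs = tokenizer(sample, return_tensors=\"pt\", truncation=True, max_length=512).to(model.device)\n", |
| | + "summary_ids = model.generate(**inputs, max_new_tokens=128)\n", |
| | + "print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))" |
| | + ] |
| | + }, |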
256 | 365 | {
|
257 | 366 | "cell_type": "markdown",
|
258 | 367 | "id": "6ac841f6-c65e-4ebf-8c42-3030e2f92cb0",
|
|
342 | 451 | "id": "9204b6dc-8f6e-407e-8c68-a036a6a5b7c9",
|
343 | 452 | "metadata": {},
|
344 | 453 | "source": [
|
345 |
| - "### Training our Model" |
| 454 | + "### Training our ModelFinetuning our Model via Vertex AI Training API" |
346 | 455 | ]
|
347 | 456 | },
|
348 | 457 | {
|
|
634 | 743 | "name": "python",
|
635 | 744 | "nbconvert_exporter": "python",
|
636 | 745 | "pygments_lexer": "ipython3",
|
637 |
| - "version": "3.10.12" |
| 746 | + "version": "3.10.13" |
638 | 747 | }
|
639 | 748 | },
|
640 | 749 | "nbformat": 4,
|
|