
Commit ef498ee

Merge branch 'main' into reformat_notebooks_kao
2 parents: 90fe92d + ea6f571

5 files changed: +139 -44 lines

notebooks/GenAI/AWS_Bedrock_Intro.ipynb (+4 -4)
@@ -290,10 +290,10 @@
     "import pandas as pd\n",
     "import os\n",
     "df = pd.read_csv('oa_comm.filelist.csv')\n",
-    "#first 100 files\n",
-    "first_100=df[0:100]\n",
+    "#first 50 files\n",
+    "first_50=df[0:50]\n",
     "#save new metadata\n",
-    "first_100.to_csv('oa_comm.filelist_100.csv', index=False)"
+    "first_50.to_csv('oa_comm.filelist_50.csv', index=False)"
    ]
   },
   {
@@ -305,7 +305,7 @@
   "source": [
    "import os\n",
    "#gather path to files in bucket\n",
-    "for i in first_100['Key']:\n",
+    "for i in first_50['Key']:\n",
    "    os.system(f'aws s3 cp s3://pmc-oa-opendata/{i} s3://{bucket}/docs/ --sse')"
   ]
  },
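The copy loop above shells out to the AWS CLI once per file. For reference, a minimal boto3 sketch of the same server-side copy; the `bucket` placeholder and the `docs/{filename}` key layout are assumptions for illustration, not part of this commit:

```python
import boto3
import pandas as pd

s3 = boto3.client("s3")
bucket = "your-bucket-name"  # hypothetical destination bucket

df = pd.read_csv("oa_comm.filelist.csv")
first_50 = df[0:50]

for key in first_50["Key"]:
    # Managed server-side copy with SSE, matching `aws s3 cp ... --sse`;
    # the objects never pass through the notebook instance.
    s3.copy(
        CopySource={"Bucket": "pmc-oa-opendata", "Key": key},
        Bucket=bucket,
        Key=f"docs/{key.split('/')[-1]}",
        ExtraArgs={"ServerSideEncryption": "AES256"},
    )
```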

notebooks/GenAI/AWS_GenAI_Huggingface.ipynb (+113 -4)
@@ -188,6 +188,14 @@
     "train_dataset, test_dataset = load_dataset(\"ccdv/pubmed-summarization\", split=[\"train\", \"test\"])\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3399abb1-af8f-46ee-92ea-c8344eeddd09",
+   "metadata": {},
+   "source": [
+    "## Finetuning our Model Locally"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ed6ddff1-2636-4e3b-88ee-e3c86c584245",
@@ -210,9 +218,10 @@
    "outputs": [],
    "source": [
     "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "model_name=\"google/flan-t5-small\"\n",
     "\n",
-    "model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n",
-    "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")"
+    "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
    ]
   },
   {
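The refactor above only lifts the model name into a variable. As a quick sanity check that `model_name` loads a working summarizer, a sketch of one-off inference; the sample text and generation settings are illustrative, not from the notebook:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# T5-style models take a task prefix; "summarize:" triggers summarization.
text = "summarize: The study enrolled 120 patients with type 2 diabetes ..."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
output_ids = model.generate(**inputs, max_new_tokens=60)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```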
@@ -253,6 +262,106 @@
     "test_dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"abstracts\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b3ffd612-abde-4666-8c85-cc7069de2129",
+   "metadata": {},
+   "source": [
+    "The first step to training our model, other than setting up our datasets, is to set our **hyperparameters**. Hyperparameters depend on your training script; for this one we need to identify our model, the location of our train and test files, etc. In this case we are using one created by Hugging Face."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c06bef19-cc3c-476f-943c-78368e9f49e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments\n",
+    "\n",
+    "training_args = TrainingArguments(output_dir=\"test_trainer\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cff31d69-9f54-4235-a377-7c5e758fbca8",
+   "metadata": {},
+   "source": [
+    "Next, create the metric used to evaluate the model's accuracy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24bbe62e-9140-4bef-88ae-3e5029ddb25c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import evaluate\n",
+    "\n",
+    "metric = evaluate.load(\"accuracy\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b82caeba-2daa-4526-b67d-04f45d4a9934",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_metrics(eval_pred):\n",
+    "    logits, labels = eval_pred\n",
+    "    predictions = np.argmax(logits, axis=-1)\n",
+    "    return metric.compute(predictions=predictions, references=labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5b50ec0-87b8-4578-96aa-e26bda9d99b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments, Trainer\n",
+    "\n",
+    "training_args = TrainingArguments(output_dir=\"test_trainer\", evaluation_strategy=\"epoch\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df2225ac-8e92-4a14-a368-eebff9ead6bf",
+   "metadata": {},
+   "source": [
+    "Finally we can train our model!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e59332ae-c9e3-4a9b-9a7c-7020c87227da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    "    eval_dataset=test_dataset,\n",
+    "    compute_metrics=compute_metrics,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f35520bb-b6ca-4996-b87e-2fbfdcfc0dff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "6ac841f6-c65e-4ebf-8c42-3030e2f92cb0",
@@ -342,7 +451,7 @@
    "id": "9204b6dc-8f6e-407e-8c68-a036a6a5b7c9",
    "metadata": {},
    "source": [
-    "### Training our Model"
+    "### Finetuning our Model via SageMaker Training API"
    ]
   },
   {
@@ -634,7 +743,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,

notebooks/GenAI/AWS_GenAI_Jumpstart.ipynb (+4 -10)
@@ -75,20 +75,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "6cf1429a-314e-49b6-a4f7-16a3e52319af",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "(\n",
-    "    model_id,\n",
-    "    model_version,\n",
-    ") = (\n",
-    "    \"meta-textgeneration-llama-2-7b-f\",\n",
-    "    \"*\",\n",
-    ")"
+    "model_id, model_version = \"meta-textgeneration-llama-2-13b-f\", \"2.*\""
    ]
   },
   {
@@ -110,7 +104,7 @@
    "source": [
     "from sagemaker.jumpstart.model import JumpStartModel\n",
     "\n",
-    "model = JumpStartModel(model_id=model_id)\n",
+    "model = JumpStartModel(model_id=model_id, model_version=model_version)\n",
     "predictor = model.deploy()\n"
    ]
   },
@@ -254,7 +248,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.13"
   }
  },
 "nbformat": 4,

notebooks/GenAI/Pubmed_RAG_chatbot.ipynb (+15 -20)
@@ -77,13 +77,7 @@
   },
   "outputs": [],
   "source": [
-    "(\n",
-    "    model_id,\n",
-    "    model_version,\n",
-    ") = (\n",
-    "    \"meta-textgeneration-llama-2-7b-f\",\n",
-    "    \"*\",\n",
-    ")"
+    "model_id, model_version = \"meta-textgeneration-llama-2-13b-f\", \"2.*\""
   ]
  },
  {
@@ -105,8 +99,8 @@
    "source": [
     "from sagemaker.jumpstart.model import JumpStartModel\n",
     "\n",
-    "model = JumpStartModel(model_id=model_id)\n",
-    "predictor = model.deploy()"
+    "model = JumpStartModel(model_id=model_id, model_version=model_version)\n",
+    "predictor = model.deploy()\n"
    ]
   },
   {
@@ -225,7 +219,7 @@
    "id": "93a8595a-767f-4cad-9273-62d8e2cf60d1",
    "metadata": {},
    "source": [
-    "We only want the metadata of the first 100 files to keep this tutorial short."
+    "We only want the metadata of the first 50 files to keep this tutorial short."
    ]
   },
   {
@@ -242,19 +236,20 @@
    "import os\n",
    "\n",
    "df = pd.read_csv('oa_comm.filelist.csv')\n",
+
    "\n",
-    "#first 100 files\n",
-    "first_100=df[0:101]\n",
+    "#first 50 files\n",
+    "first_50=df[0:50]\n",
    "#save new metadata\n",
-    "first_100.to_csv('oa_comm.filelist_100.csv', index=False)"
+    "first_50.to_csv('oa_comm.filelist_50.csv', index=False)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "abd1ae93-450e-4c79-83cc-ea46a1b507c1",
    "metadata": {},
    "source": [
-    "Lets look at our metadata! We can see that the bucket path to the files are under the **Key** column. This column is what we will use to loop through the PMC bucket and copy the first 100 files to our bucket."
+    "Let's look at our metadata! We can see that the bucket paths to the files are under the **Key** column. This column is what we will use to loop through the PMC bucket and copy the first 50 files to our bucket."
    ]
   },
   {
@@ -264,7 +259,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "first_100"
+    "first_50"
    ]
   },
   {
@@ -276,7 +271,7 @@
    "source": [
     "import os\n",
     "#gather path to files in bucket\n",
-    "for i in first_100['Key']:\n",
+    "for i in first_50['Key']:\n",
     "    os.system(f'aws s3 cp s3://pmc-oa-opendata/{i} s3://{bucket}/docs/ --sse')"
    ]
   },
@@ -295,7 +290,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! aws s3 cp oa_comm.filelist_100.csv s3://{bucket}/docs/"
+    "! aws s3 cp oa_comm.filelist_50.csv s3://{bucket}/docs/"
    ]
   },
   {
@@ -373,12 +368,12 @@
   },
   "source": [
    "```python\n",
-    "from langchain.retrievers import PubMedRetriever\n",
+    "from langchain_community.retrievers import PubMedRetriever\n",
    "from langchain.retrievers import AmazonKendraRetriever\n",
-    "from langchain.llms import SagemakerEndpoint\n",
+    "from langchain_community.llms import SagemakerEndpoint\n",
+    "from langchain_community.llms.sagemaker_endpoint import LLMContentHandler\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "from langchain.prompts import PromptTemplate\n",
-    "from langchain.llms.sagemaker_endpoint import LLMContentHandler\n",
    "import sys\n",
    "import json\n",
    "import os\n",

notebooks/GenAI/example_scripts/langchain_chat_llama_2_zeroshot.py (+3 -6)
@@ -1,12 +1,11 @@
-from langchain.retrievers import PubMedRetriever
+from langchain_community.retrievers import PubMedRetriever
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
-#from langchain import SagemakerEndpoint
-from langchain.llms.sagemaker_endpoint import LLMContentHandler
+from langchain_community.llms import SagemakerEndpoint
+from langchain_community.llms.sagemaker_endpoint import LLMContentHandler
 import sys
 import json
 import os
-from langchain.llms import SagemakerEndpoint
 
 
 class bcolors:
@@ -24,7 +23,6 @@ class bcolors:
 
 def build_chain():
     region = os.environ["AWS_REGION"]
-    #kendra_index_id = os.environ["KENDRA_INDEX_ID"]
     endpoint_name = os.environ["LLAMA_2_ENDPOINT"]
 
     class ContentHandler(LLMContentHandler):
@@ -58,7 +56,6 @@ def transform_output(self, output: bytes) -> str:
         content_handler=content_handler,
     )
 
-    #retriever = AmazonKendraRetriever(index_id=kendra_index_id,region_name=region)
     retriever= PubMedRetriever()
 
     prompt_template = """
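The diff ends at `prompt_template`; the rest of `build_chain` is unchanged by this commit. For orientation, a sketch of how the remaining pieces usually assemble into a chain. `FakeListLLM` stands in for the script's `SagemakerEndpoint` so the snippet runs without an AWS endpoint, and the prompt text and question are illustrative only:

```python
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.fake import FakeListLLM
from langchain_community.retrievers import PubMedRetriever

# Stand-in LLM; swap in the SagemakerEndpoint built in build_chain for real use.
llm = FakeListLLM(responses=["BRCA1 is a tumor-suppressor gene involved in DNA repair."])
retriever = PubMedRetriever()  # queries PubMed over the network

# Illustrative prompt; the script's real template is not shown in this diff.
prompt_template = """Use the following context to answer the question.
{context}

Question: {question}
Answer:"""
qa_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    combine_docs_chain_kwargs={"prompt": qa_prompt},
)
result = chain.invoke({"question": "What is known about BRCA1?", "chat_history": []})
print(result["answer"])
```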
