
Commit 629970b

added more examples for annotation (#16)
Co-authored-by: Shahules786 <[email protected]>
1 parent b30245d commit 629970b

File tree: 3 files changed (+846, -0 lines)
New file (+143 lines): an example notebook evaluating answer correctness with AspectCritic.
The notebook begins with the imports:

```python
from datasets import load_dataset
from ragas import evaluate, EvaluationDataset
from ragas.metrics import AspectCritic
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
```

The recorded output of this cell shows tqdm and pysbd warnings on stderr, followed by the import failing in an environment without nltk:

```
ImportError: nltk is required for bleu score. Please install it using `pip install nltk`
```

Per the traceback, the error is raised in `BleuScore.__post_init__` (ragas/metrics/_bleu_score.py): `ragas.metrics` instantiates a module-level `bleu_score = BleuScore()` on import, so `from ragas import evaluate, EvaluationDataset` fails whenever nltk is absent.
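Since the import chain instantiates `BleuScore` unconditionally, the cell above only succeeds once nltk is installed. Here is a minimal guard cell, not part of the commit, that assumes only that pip is available in the environment:

```python
# Hypothetical setup cell: install the optional nltk dependency that
# ragas.metrics needs before any `from ragas import ...` runs.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("nltk") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
```

Inside Jupyter, `%pip install nltk` in its own cell achieves the same thing.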
53+
## Dataset
61+
```python
dataset = load_dataset("explodinggradients/aspect_critic_answer_correctness", split="train")
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)
```

The recorded repr of the resulting dataset:

```
EvaluationDataset(features=['user_input', 'response', 'reference'], len=50)
```
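`from_hf_dataset` maps the split's columns onto ragas sample fields. The same object can also be built by hand, which is useful for data that is not on the Hub; a minimal sketch, with field names taken from the repr above and the example row invented for illustration:

```python
from ragas import EvaluationDataset, SingleTurnSample

samples = [
    # Hypothetical row; the real dataset has 50 of these.
    SingleTurnSample(
        user_input="What is the capital of France?",
        response="The capital of France is Paris.",
        reference="Paris",
    ),
]
manual_dataset = EvaluationDataset(samples=samples)
```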
72+
## Set Model
80+
```python
llm_4o = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
```
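The wrapped model is handed to `evaluate()` below via its `llm=` argument. As far as I know, a metric can also carry its own judge model, in which case the metric-level model is used for that metric; a sketch under that assumption:

```python
# Assumption: AspectCritic accepts llm= directly, overriding the
# evaluate()-level default for this metric alone.
critic_with_own_llm = AspectCritic(
    name="answer_correctness",
    definition="Given the user_input, reference, and response, is the response correct compared with the reference?",
    llm=llm_4o,
)
```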
102+
## Evaluate
110+
```python
critic = AspectCritic(
    name="answer_correctness",
    definition="Given the user_input, reference, and response, is the response correct compared with the reference?",
)
results = evaluate(eval_dataset, metrics=[critic], llm=llm_4o)
```
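The commit message frames these notebooks as annotation examples, so the natural next step is to get the per-sample verdicts in front of a human. A sketch assuming the usual `to_pandas()` accessor on the result object; the CSV filename is ours:

```python
# One row per sample, with the metric's verdict as a column.
df = results.to_pandas()
df.to_csv("answer_correctness_for_annotation.csv", index=False)
print(df.head())
```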
121+
Notebook metadata: kernel "notes" (python3), Python 3.12.6, nbformat 4.
