|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "id": "22d60861-5bb3-48b0-a6a1-c3008c63c14b", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [ |
| 9 | + { |
| 10 | + "name": "stderr", |
| 11 | + "output_type": "stream", |
| 12 | + "text": [ |
| 13 | + "/home/jjmachan/.pyenv/versions/notes/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
| 14 | + " from .autonotebook import tqdm as notebook_tqdm\n", |
| 15 | + "/home/jjmachan/.pyenv/versions/notes/lib/python3.12/site-packages/pysbd/segmenter.py:66: SyntaxWarning: invalid escape sequence '\\s'\n", |
| 16 | + " for match in re.finditer('{0}\\s*'.format(re.escape(sent)), self.original_text):\n", |
| 17 | + "/home/jjmachan/.pyenv/versions/notes/lib/python3.12/site-packages/pysbd/lang/arabic.py:29: SyntaxWarning: invalid escape sequence '\\.'\n", |
| 18 | + " txt = re.sub('(?<={0})\\.'.format(am), '∯', txt)\n", |
| 19 | + "/home/jjmachan/.pyenv/versions/notes/lib/python3.12/site-packages/pysbd/lang/persian.py:29: SyntaxWarning: invalid escape sequence '\\.'\n", |
| 20 | + " txt = re.sub('(?<={0})\\.'.format(am), '∯', txt)\n" |
| 21 | + ] |
| 22 | + }, |
| 23 | + { |
| 24 | + "ename": "ImportError", |
| 25 | + "evalue": "nltk is required for bleu score. Please install it using `pip install nltk`", |
| 26 | + "output_type": "error", |
| 27 | + "traceback": [ |
| 28 | + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
| 29 | + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", |
| 30 | + "File \u001b[0;32m~/.pyenv/versions/notes/lib/python3.12/site-packages/ragas/metrics/_bleu_score.py:23\u001b[0m, in \u001b[0;36mBleuScore.__post_init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnltk\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtokenize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m word_tokenize\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnltk\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtranslate\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbleu_score\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m corpus_bleu\n", |
| 31 | + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'nltk'", |
| 32 | + "\nDuring handling of the above exception, another exception occurred:\n", |
| 33 | + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", |
| 34 | + "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m evaluate, EvaluationDataset\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AspectCritic\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_openai\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchat_models\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatOpenAI\n", |
| 35 | + "File \u001b[0;32m~/.pyenv/versions/notes/lib/python3.12/site-packages/ragas/__init__.py:2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset_schema\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EvaluationDataset, MultiTurnSample, SingleTurnSample\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mevaluation\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m evaluate\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrun_config\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RunConfig\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", |
| 36 | + "File \u001b[0;32m~/.pyenv/versions/notes/lib/python3.12/site-packages/ragas/evaluation.py:29\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mllms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m llm_factory\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mllms\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseRagasLLM, LangchainLLMWrapper\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AspectCritic\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_answer_correctness\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AnswerCorrectness\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 32\u001b[0m Metric,\n\u001b[1;32m 33\u001b[0m MetricWithEmbeddings,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 37\u001b[0m is_reproducable,\n\u001b[1;32m 38\u001b[0m )\n", |
| 37 | + "File \u001b[0;32m~/.pyenv/versions/notes/lib/python3.12/site-packages/ragas/metrics/__init__.py:16\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_answer_similarity\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 11\u001b[0m AnswerSimilarity,\n\u001b[1;32m 12\u001b[0m SemanticSimilarity,\n\u001b[1;32m 13\u001b[0m answer_similarity,\n\u001b[1;32m 14\u001b[0m )\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_aspect_critic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AspectCritic\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_bleu_score\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BleuScore\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_context_entities_recall\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 18\u001b[0m ContextEntityRecall,\n\u001b[1;32m 19\u001b[0m context_entity_recall,\n\u001b[1;32m 20\u001b[0m )\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_context_precision\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 22\u001b[0m ContextPrecision,\n\u001b[1;32m 23\u001b[0m ContextUtilization,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 26\u001b[0m context_precision,\n\u001b[1;32m 27\u001b[0m )\n", |
| 38 | + "File \u001b[0;32m~/.pyenv/versions/notes/lib/python3.12/site-packages/ragas/metrics/_bleu_score.py:54\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_ascore\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: t\u001b[38;5;241m.\u001b[39mDict, callbacks: Callbacks) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mfloat\u001b[39m:\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_single_turn_ascore(SingleTurnSample(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mrow), callbacks)\n\u001b[0;32m---> 54\u001b[0m bleu_score \u001b[38;5;241m=\u001b[39m \u001b[43mBleuScore\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", |
| 39 | + "File \u001b[0;32m<string>:7\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, _required_columns, name, weights, sentence_segmenter)\u001b[0m\n", |
| 40 | + "File \u001b[0;32m~/.pyenv/versions/notes/lib/python3.12/site-packages/ragas/metrics/_bleu_score.py:26\u001b[0m, in \u001b[0;36mBleuScore.__post_init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnltk\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtranslate\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbleu_score\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m corpus_bleu\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnltk is required for bleu score. Please install it using `pip install nltk`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 28\u001b[0m )\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msegmenter \u001b[38;5;241m=\u001b[39m get_segmenter()\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mword_tokenizer \u001b[38;5;241m=\u001b[39m word_tokenize\n", |
| 41 | + "\u001b[0;31mImportError\u001b[0m: nltk is required for bleu score. Please install it using `pip install nltk`" |
| 42 | + ] |
| 43 | + } |
| 44 | + ], |
| 45 | + "source": [ |
| 46 | + "from datasets import load_dataset\n", |
| 47 | + "from ragas import evaluate, EvaluationDataset\n", |
| 48 | + "from ragas.metrics import AspectCritic\n", |
| 49 | + "from langchain_openai.chat_models import ChatOpenAI\n", |
| 50 | + "from ragas.llms import LangchainLLMWrapper\n" |
| 51 | + ] |
| 52 | + }, |
| 53 | + { |
| 54 | + "cell_type": "markdown", |
| 55 | + "id": "0dab6de2-680a-4178-823f-8889a144a5d4", |
| 56 | + "metadata": {}, |
| 57 | + "source": [ |
| 58 | + "## Dataset" |
| 59 | + ] |
| 60 | + }, |
| 61 | + { |
| 62 | + "cell_type": "code", |
| 63 | + "execution_count": 13, |
| 64 | + "id": "a2e9130c-b934-4331-a0c1-ce63089dcdf8", |
| 65 | + "metadata": {}, |
| 66 | + "outputs": [], |
| 67 | + "source": [ |
| 68 | + "dataset = load_dataset(\"explodinggradients/aspect_critic_answer_correctness\",split=\"train\")\n", |
| 69 | + "eval_dataset = EvaluationDataset.from_hf_dataset(dataset)" |
| 70 | + ] |
| 71 | + }, |
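| 72 | + { |
| 73 | + "cell_type": "markdown", |
| 74 | + "id": "5c1d0e7a-2b3f-4a6c-8d9e-0f1a2b3c4d5e", |
| 75 | + "metadata": {}, |
| 76 | + "source": [ |
| 77 | + "As a quick sanity check, a single row can be inspected directly. This is a minimal sketch assuming `EvaluationDataset` supports integer indexing; in current ragas versions this returns a `SingleTurnSample`." |
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": null, |
| 83 | + "id": "9e8d7c6b-5a4f-4e3d-b2c1-0a9b8c7d6e5f", |
| 84 | + "metadata": {}, |
| 85 | + "outputs": [], |
| 86 | + "source": [ |
| 87 | + "# Peek at the first sample to confirm the expected fields\n", |
| 88 | + "# (user_input, response, reference) survived the conversion.\n", |
| 89 | + "eval_dataset[0]" |
| 90 | + ] |
| 91 | + }, |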
| 72 | + { |
| 73 | + "cell_type": "markdown", |
| 74 | + "id": "ffe5b8b9-8b1b-4ce3-95ce-51dab58458d0", |
| 75 | + "metadata": {}, |
| 76 | + "source": [ |
| 77 | + "## Set Model" |
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": 7, |
| 83 | + "id": "2c0d7d0c-d7e7-4c50-b2a5-a7336744288e", |
| 84 | + "metadata": {}, |
| 85 | + "outputs": [ |
| 86 | + { |
| 87 | + "data": { |
| 88 | + "text/plain": [ |
| 89 | + "EvaluationDataset(features=['user_input', 'response', 'reference'], len=50)" |
| 90 | + ] |
| 91 | + }, |
| 92 | + "execution_count": 7, |
| 93 | + "metadata": {}, |
| 94 | + "output_type": "execute_result" |
| 95 | + } |
| 96 | + ], |
| 97 | + "source": [ |
| 98 | + "\n", |
| 99 | + "llm_4o = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))" |
| 100 | + ] |
| 101 | + }, |
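| 101 | + { |
| 102 | + "cell_type": "markdown", |
| 103 | + "id": "3a2b1c0d-9e8f-4d7c-a6b5-4c3d2e1f0a9b", |
| 104 | + "metadata": {}, |
| 105 | + "source": [ |
| 106 | + "`ChatOpenAI` reads the API key from the `OPENAI_API_KEY` environment variable. An optional sketch for supplying it interactively when it is not already exported:" |
| 107 | + ] |
| 108 | + }, |
| 109 | + { |
| 110 | + "cell_type": "code", |
| 111 | + "execution_count": null, |
| 112 | + "id": "8b7a6c5d-4e3f-4a2b-9c1d-0e9f8a7b6c5d", |
| 113 | + "metadata": {}, |
| 114 | + "outputs": [], |
| 115 | + "source": [ |
| 116 | + "import os\n", |
| 117 | + "import getpass\n", |
| 118 | + "\n", |
| 119 | + "# Prompt for the key only when the environment does not already provide it.\n", |
| 120 | + "if \"OPENAI_API_KEY\" not in os.environ:\n", |
| 121 | + "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API key: \")" |
| 122 | + ] |
| 123 | + }, |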
| 102 | + { |
| 103 | + "cell_type": "markdown", |
| 104 | + "id": "cb91f37f-3eb0-425a-8a47-7ca6729e498e", |
| 105 | + "metadata": {}, |
| 106 | + "source": [ |
| 107 | + "## Evaluate" |
| 108 | + ] |
| 109 | + }, |
| 110 | + { |
| 111 | + "cell_type": "code", |
| 112 | + "execution_count": null, |
| 113 | + "id": "2e52e40c-10c6-4cb0-8815-c01614225b2e", |
| 114 | + "metadata": {}, |
| 115 | + "outputs": [], |
| 116 | + "source": [ |
| 117 | + "critic = AspectCritic(name=\"answer_correctness\",definition=\"Given the user_input, reference and response. Is the response correct compared with the reference\")\n", |
| 118 | + "results = evaluate(eval_dataset,metrics=[critic],llm=llm_4o)" |
| 119 | + ] |
| 120 | + } |
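| 121 | + { |
| 122 | + "cell_type": "markdown", |
| 123 | + "id": "6f5e4d3c-2b1a-4f0e-9d8c-7b6a5f4e3d2c", |
| 124 | + "metadata": {}, |
| 125 | + "source": [ |
| 126 | + "Once the run completes, the result can be summarised and inspected per sample. A minimal sketch assuming the standard ragas result API (`to_pandas()`); the score column is named after the metric defined above." |
| 127 | + ] |
| 128 | + }, |
| 129 | + { |
| 130 | + "cell_type": "code", |
| 131 | + "execution_count": null, |
| 132 | + "id": "2d3e4f5a-6b7c-4d8e-9f0a-1b2c3d4e5f6a", |
| 133 | + "metadata": {}, |
| 134 | + "outputs": [], |
| 135 | + "source": [ |
| 136 | + "# Overall score: the critic's 0/1 verdicts averaged over the dataset.\n", |
| 137 | + "print(results)\n", |
| 138 | + "\n", |
| 139 | + "# Per-sample breakdown; the 'answer_correctness' column holds each verdict.\n", |
| 140 | + "results.to_pandas().head()" |
| 141 | + ] |
| 142 | + } |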
| 121 | + ], |
| 122 | + "metadata": { |
| 123 | + "kernelspec": { |
| 124 | + "display_name": "notes", |
| 125 | + "language": "python", |
| 126 | + "name": "python3" |
| 127 | + }, |
| 128 | + "language_info": { |
| 129 | + "codemirror_mode": { |
| 130 | + "name": "ipython", |
| 131 | + "version": 3 |
| 132 | + }, |
| 133 | + "file_extension": ".py", |
| 134 | + "mimetype": "text/x-python", |
| 135 | + "name": "python", |
| 136 | + "nbconvert_exporter": "python", |
| 137 | + "pygments_lexer": "ipython3", |
| 138 | + "version": "3.12.6" |
| 139 | + } |
| 140 | + }, |
| 141 | + "nbformat": 4, |
| 142 | + "nbformat_minor": 5 |
| 143 | +} |