Commit c6b5a70

feat: update the Ragas evaluator to have the OpenAI client as something that gets passed in to __init__
Signed-off-by: Oleg S <[email protected]>
1 parent: 04117dd

2 files changed: 75 additions, 53 deletions
src/instructlab/eval/ragas.py

Lines changed: 22 additions & 17 deletions
@@ -5,6 +5,7 @@
 
 # Third Party
 from langchain_community.chat_models import ChatOpenAI
+from openai import Client as OpenAIClient
 from pandas import DataFrame, read_json
 from pydantic import BaseModel, ConfigDict, field_validator
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
@@ -16,7 +17,6 @@
 
 # Local
 from .evaluator import Evaluator
-from .mt_bench_common import get_openai_client
 
 
 class Sample(TypedDict):
@@ -49,19 +49,12 @@ class Sample(TypedDict):
 class ModelConfig(BaseModel):
     model_config = ConfigDict(protected_namespaces=())
 
-    # URL of the OpenAI server where the model shall be hosted.
-    base_url: str
-
     # name of the model to use.
     model_name: str
 
     # The system prompt to be used when applying the chat template.
     system_prompt: str = _DEFAULT_SYSTEM_PROMPT
 
-    # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client)
-    # To provide an OpenAI key, you must set it here; else the default is used.
-    api_key: str = "no-api-key"
-
     # "model randomness" aka likelihood of sampling something other than the likeliest token
     temperature: float = 0.0
 
@@ -87,15 +80,18 @@ def __init__(
         self,
         student_model: ModelConfig | None = None,
         run_config: RunConfig | None = None,
+        openai_client: OpenAIClient | None = None,
     ):
         self.student_model = student_model
         self.run_config = run_config
+        self.openai_client = openai_client
 
     def run(
         self,
         dataset: List[Sample] | Path,
         student_model: ModelConfig | None = None,
         run_config: RunConfig | None = None,
+        openai_client: OpenAIClient | None = None,
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.
@@ -115,12 +111,16 @@ def run(
                 a default one is created containing extremely permissive settings when handling
                 timeouts. This is because by default, OpenAI tier-1 usage accounts have very high
                 rate limits resulting in heavy throttling during evaluations.
+            openai_client (openai.Client | None, optional):
+                The client to use when generating questions from the student model, must be compatible with the OpenAI API.
+                This field is required when `student_model` is provided.
 
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
         student_model = student_model if student_model else self.student_model
         run_config = run_config if run_config else self.run_config
+        openai_client = openai_client if openai_client else self.openai_client
 
         if not dataset:
             raise ValueError(
@@ -140,14 +140,20 @@ def run(
         assert input_df is not None
 
         need_to_generate_questions = "response" not in input_df.columns
-        if need_to_generate_questions and not student_model:
+        if need_to_generate_questions and (not student_model or not openai_client):
             raise ValueError(
-                "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
+                "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference"
             )
 
         # if the student model was provided then we always generate regardless
         if student_model:
-            input_df = self._generate_answers_from_model(input_df, student_model)
+            if not openai_client:
+                raise ValueError(
+                    "`student_model` was specified but `openai_client` was not provided"
+                )
+            input_df = self._generate_answers_from_model(
+                input_df, student_model, openai_client
+            )
 
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
@@ -176,16 +182,15 @@ def run(
         return results
 
     def _generate_answers_from_model(
-        self, questions: DataFrame, student_model: ModelConfig
+        self,
+        questions: DataFrame,
+        student_model: ModelConfig,
+        openai_client: OpenAIClient,
     ) -> DataFrame:
         """
         Given a DataFrame containing `user_input` columns, generates responses from the given model
        and returns a new DataFrame containing its answers in the `response` column.
         """
-        client = get_openai_client(
-            model_api_base=student_model.base_url, api_key=student_model.api_key
-        )
-
         # initialize response to write into
         updated_df = questions.copy()
         updated_df["response"] = ""
@@ -195,7 +200,7 @@ def _generate_answers_from_model(
                 student_model.system_prompt,
                 qna["user_input"],
             ]
-            response = client.chat.completions.create(
+            response = openai_client.chat.completions.create(
                 messages=messages,
                 model=student_model.model_name,
                 # specify the seed so we can at least try to have some reproducibility when the clients support it
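
Caller-side sketch of the updated API (illustrative, not part of this commit): the endpoint URL and API key that previously lived on ModelConfig as base_url/api_key now travel on the client object itself, and the client can either be bound at construction time or supplied per call. The endpoint, key, model name, and file path below are placeholders.

# Hypothetical usage of the new signature; endpoint, key, and paths are placeholders.
# Standard
from pathlib import Path

# Third Party
from openai import Client as OpenAIClient

# First Party
from instructlab.eval.ragas import ModelConfig, RagasEvaluator

# connection details now belong to the client, not to ModelConfig
client = OpenAIClient(
    base_url="https://your.model.endpoint.com/v1",  # placeholder endpoint
    api_key="no-api-key",                           # placeholder key
)
student_model = ModelConfig(model_name="super-jeeves-8x700B")

# the client can be bound once at construction time ...
evaluator = RagasEvaluator(student_model=student_model, openai_client=client)
results = evaluator.run(dataset=Path("questions.jsonl"))

# ... or passed per call, overriding the instance attribute
results = evaluator.run(
    dataset=Path("questions.jsonl"),
    student_model=student_model,
    openai_client=client,
)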

tests/test_ragas.py

Lines changed: 53 additions & 36 deletions
@@ -11,58 +11,55 @@
 import pandas as pd
 
 # First Party
-from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample
+from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig
 
 
 class TestRagasEvaluator(unittest.TestCase):
-    @patch("instructlab.eval.ragas.get_openai_client")
-    def test_generate_answers_from_model(self, mock_get_openai_client):
+    def test_generate_answers_from_model(self):
         # mock the OpenAI client to always return "london" for chat completions
+        user_input = "What is the capital of France?"
+        model_response = "London"
         mock_client = MagicMock()
         mock_response = MagicMock()
-        mock_response.choices[0].message.content = "London"
+        mock_response.choices = [MagicMock(message=MagicMock(content=model_response))]
         mock_client.chat.completions.create.return_value = mock_response
-        mock_get_openai_client.return_value = mock_client
 
         # get answers
-        questions = pd.DataFrame({"user_input": ["What is the capital of France?"]})
+        questions = pd.DataFrame({"user_input": [user_input]})
         student_model = ModelConfig(
-            base_url="https://your.model.endpoint.com",
-            model_name="jeeves-512B",
-            api_key="test-api-key",
+            model_name="super-jeeves-8x700B",
         )
         evaluator = RagasEvaluator()
-        result_df = evaluator._generate_answers_from_model(questions, student_model)
+        result_df = evaluator._generate_answers_from_model(
+            questions, student_model, mock_client
+        )
 
         # what we expect to see
         expected_df = questions.copy()
-        expected_df["response"] = ["London"]
+        expected_df["response"] = [model_response]
 
         # perform the assertions
         pd.testing.assert_frame_equal(result_df, expected_df)
-        mock_get_openai_client.assert_called_once_with(
-            model_api_base=student_model.base_url, api_key=student_model.api_key
-        )
         mock_client.chat.completions.create.assert_called_once_with(
-            messages=[student_model.system_prompt, "What is the capital of France?"],
+            messages=[student_model.system_prompt, user_input],
             model=student_model.model_name,
             seed=42,
             max_tokens=student_model.max_tokens,
             temperature=student_model.temperature,
         )
 
+    @patch("instructlab.eval.ragas.ChatOpenAI")
     @patch("instructlab.eval.ragas.read_json")
     @patch("instructlab.eval.ragas.evaluate")
-    @patch("instructlab.eval.ragas.ChatOpenAI")
     @patch.object(RagasEvaluator, "_generate_answers_from_model")
     @patch.object(RagasEvaluator, "_get_metrics")
     def test_run(
         self,
         mock_get_metrics: MagicMock,
         mock_generate_answers_from_model: MagicMock,
-        mock_ChatOpenAI: MagicMock,
         mock_evaluate: MagicMock,
         mock_read_json: MagicMock,
+        mock_ChatOpenAI: MagicMock,
     ):
         ########################################################################
         # SETUP EVERYTHING WE NEED FOR THE TESTS
@@ -74,16 +71,20 @@ def test_run(
         student_model_response = "Paris"
         user_question = "What is the capital of France?"
         golden_answer = "The capital of France is Paris."
+        metric = "mocked-metric"
+        metric_score = 4.0
         base_ds = [{"user_input": user_question, "reference": golden_answer}]
-        mocked_metric = "mocked-metric"
-        mocked_metric_score = 4.0
+        student_model = ModelConfig(
+            model_name="super-jeeves-8x700B",
+        )
+        run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
 
         # The following section takes care of mocking function return calls.
         # Ragas is tricky because it has some complex data structures under the hood,
         # so what we have to do is configure the intermediate outputs that we expect
         # to receive from Ragas.
 
-        mock_get_metrics.return_value = [mocked_metric]
+        mock_get_metrics.return_value = [metric]
         interim_df = DataFrame(
             {
                 "user_input": [user_question],
@@ -93,7 +94,12 @@ def test_run(
         )
         mock_generate_answers_from_model.return_value = interim_df.copy()
         mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
-        mock_ChatOpenAI.return_value = MagicMock()
+        mock_client = MagicMock()
+        mock_response = MagicMock()
+        mock_response.choices = [
+            MagicMock(message=MagicMock(content=student_model_response))
+        ]
+        mock_client.chat.completions.create.return_value = mock_response
 
         # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it.
         # It isn't functionally used for our purposes though.
@@ -109,29 +115,20 @@ def test_run(
             )
         }
         mock_evaluate.return_value = EvaluationResult(
-            scores=[{mocked_metric: mocked_metric_score}],
+            scores=[{metric: metric_score}],
             dataset=mocked_evaluation_ds,
             ragas_traces=_unimportant_ragas_traces,
         )
 
-        ########################################################################
-        # Run the tests
-        ########################################################################
-
-        # Configure all other inputs that Ragas does not depend on for proper mocking
-        student_model = ModelConfig(
-            base_url="https://api.openai.com",
-            model_name="pt-3.5-turbo",
-            api_key="test-api-key",
-        )
-        run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
-        evaluator = RagasEvaluator()
-
         ########################################################################
         # Test case: directly passing a dataset
         ########################################################################
+        evaluator = RagasEvaluator()
         result = evaluator.run(
-            dataset=base_ds, student_model=student_model, run_config=run_config
+            dataset=base_ds,
+            student_model=student_model,
+            run_config=run_config,
+            openai_client=mock_client,
         )
 
         self.assertIsInstance(result, EvaluationResult)
@@ -142,11 +139,13 @@ def test_run(
         ########################################################################
         # Test case: passing a dataset in via Path to JSONL file
         ########################################################################
+        evaluator = RagasEvaluator()
         mock_read_json.return_value = DataFrame(base_ds)
         result = evaluator.run(
             dataset=Path("dummy_path.jsonl"),
             student_model=student_model,
             run_config=run_config,
+            openai_client=mock_client,
         )
 
         self.assertIsInstance(result, EvaluationResult)
@@ -156,6 +155,24 @@ def test_run(
         mock_generate_answers_from_model.assert_called()
         mock_evaluate.assert_called()
 
+        ########################################################################
+        # Test case: using the instance attributes
+        ########################################################################
+        evaluator = RagasEvaluator(
+            student_model=student_model,
+            openai_client=mock_client,
+            run_config=run_config,
+        )
+        mock_read_json.return_value = DataFrame(base_ds)
+        result = evaluator.run(dataset=Path("dummy_path.jsonl"))
+
+        self.assertIsInstance(result, EvaluationResult)
+        mock_read_json.assert_called_with(
+            Path("dummy_path.jsonl"), orient="records", lines=True
+        )
+        mock_generate_answers_from_model.assert_called()
+        mock_evaluate.assert_called()
+
 
 if __name__ == "__main__":
     unittest.main()
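
For illustration only (not part of this commit): the updated test drives the client solely through chat.completions.create(...) and reads choices[0].message.content off the return value, which is the whole OpenAI-compatible surface the evaluator now depends on. A minimal hand-rolled stand-in, assuming nothing beyond what the diff above shows, could look like this:

# Illustrative fake of the OpenAI-compatible surface the evaluator touches.
# Standard
from types import SimpleNamespace


class FakeChatCompletions:
    # mimics client.chat.completions with a canned answer
    def __init__(self, canned_answer: str):
        self.canned_answer = canned_answer

    def create(self, **kwargs):
        # mirror the shape the evaluator reads: response.choices[0].message.content
        message = SimpleNamespace(content=self.canned_answer)
        return SimpleNamespace(choices=[SimpleNamespace(message=message)])


class FakeOpenAIClient:
    def __init__(self, canned_answer: str = "London"):
        self.chat = SimpleNamespace(completions=FakeChatCompletions(canned_answer))


# An instance can be passed wherever an openai_client is expected, e.g.
#   evaluator._generate_answers_from_model(questions, student_model, FakeOpenAIClient())
# (the MagicMock used in the tests above additionally records call arguments for assertions).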
