Commit 798dc4a

fix: avoid FaithfulnessEvaluator and ContextRelevanceEvaluator returning NaN (#7685)

* initial import
* fixing tests
* relaxing condition
* adding safeguard for ContextRelevanceEvaluator as well
* adding release notes
1 parent cc869b1 · commit 798dc4a

5 files changed: +80 −2

haystack/components/evaluators/context_relevance.py

Lines changed: 4 additions & 1 deletion

@@ -136,7 +136,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
 
         # calculate average statement relevance score per query
         for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])
 
         # calculate average context relevance score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])

haystack/components/evaluators/faithfulness.py

Lines changed: 4 additions & 1 deletion

@@ -154,7 +154,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
 
         # calculate average statement faithfulness score per query
        for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])
 
         # calculate average answer faithfulness score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` now return `0` instead of `NaN` when applied to an empty context or empty statements.
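For context, a hedged usage sketch of the fixed behavior. The import path follows Haystack 2.x conventions; running this for real makes OpenAI calls and needs a valid `OPENAI_API_KEY`:

```python
# Hedged usage sketch; requires a real OPENAI_API_KEY and makes LLM calls.
import os

from haystack.components.evaluators import ContextRelevanceEvaluator

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder key

evaluator = ContextRelevanceEvaluator()
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[[]],  # an empty context could produce NaN before; it now scores 0
)
print(result["score"])  # a finite float after this fix, never NaN
```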

test/components/evaluators/test_context_relevance_evaluator.py

Lines changed: 32 additions & 0 deletions

@@ -121,6 +121,38 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }
 
+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        results = component.run(questions=questions, contexts=contexts)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator()
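For readers checking the expected values: the asserted numbers fall out of the guarded averaging and can be reproduced without Haystack:

```python
# The arithmetic behind the asserted scores (no Haystack required).
import numpy as np

first = np.mean([1, 0])          # 0.5 -- mean of the mocked statement_scores
second = 0                       # no statements extracted -> guarded to 0
print(np.mean([first, second]))  # 0.25 -- the aggregate "score" in the assert
```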

test/components/evaluators/test_faithfulness_evaluator.py

Lines changed: 36 additions & 0 deletions

@@ -149,6 +149,42 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }
 
+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        predicted_answers = [
+            "Football is the most popular sport with around 4 billion followers worldwide.",
+            "I don't know.",
+        ]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator()
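Both tests rely on the same stubbing trick: `monkeypatch.setattr` swaps `OpenAIGenerator.run` for a local function, so no network call is made and the dummy API key is never used. A self-contained sketch of the pattern with illustrative names (not from the Haystack code base):

```python
# Self-contained illustration of the monkeypatch pattern used above.
# Names are illustrative, not part of Haystack. Run with: pytest <this_file>.py
import json


class Generator:
    def run(self, prompt):
        raise RuntimeError("would call a real LLM")


def test_stubbed_generator(monkeypatch):
    def fake_run(self, prompt):
        return {"replies": [json.dumps({"statements": [], "statement_scores": []})]}

    # Replace the method on the class for this test only, just as the tests
    # above patch OpenAIGenerator.run -- no API key or network needed.
    monkeypatch.setattr(Generator, "run", fake_run)

    reply = json.loads(Generator().run("any prompt")["replies"][0])
    assert reply["statements"] == []
```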
