Commit 798dc4a

fix: avoid FaithfulnessEvaluator and ContextRelevanceEvaluator returning NaN (#7685)

* initial import
* fixing tests
* relaxing condition
* adding safeguard for ContextRelevanceEvaluator as well
* adding release notes
1 parent cc869b1 · commit 798dc4a

5 files changed: +80 −2

haystack/components/evaluators/context_relevance.py

Lines changed: 4 additions & 1 deletion

@@ -136,7 +136,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
 
         # calculate average statement relevance score per query
         for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])
 
         # calculate average context relevance score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])

haystack/components/evaluators/faithfulness.py

Lines changed: 4 additions & 1 deletion

@@ -154,7 +154,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
 
         # calculate average statement faithfulness score per query
        for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])
 
         # calculate average answer faithfulness score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` now return `0` instead of `NaN` when applied to an empty context or empty statements.
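For context, a hedged usage sketch of the fixed behavior. The import path follows Haystack 2.x conventions; running this for real makes OpenAI calls and needs a valid `OPENAI_API_KEY`:

```python
# Hedged usage sketch; requires a real OPENAI_API_KEY and makes LLM calls.
import os

from haystack.components.evaluators import ContextRelevanceEvaluator

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder key

evaluator = ContextRelevanceEvaluator()
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[[]],  # an empty context could produce NaN before; it now scores 0
)
print(result["score"])  # a finite float after this fix, never NaN
```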

test/components/evaluators/test_context_relevance_evaluator.py

Lines changed: 32 additions & 0 deletions

@@ -121,6 +121,38 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }
 
+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        results = component.run(questions=questions, contexts=contexts)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator()
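For readers checking the expected values: the asserted numbers fall out of the guarded averaging and can be reproduced without Haystack:

```python
# The arithmetic behind the asserted scores (no Haystack required).
import numpy as np

first = np.mean([1, 0])          # 0.5 -- mean of the mocked statement_scores
second = 0                       # no statements extracted -> guarded to 0
print(np.mean([first, second]))  # 0.25 -- the aggregate "score" in the assert
```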

test/components/evaluators/test_faithfulness_evaluator.py

Lines changed: 36 additions & 0 deletions

@@ -149,6 +149,42 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }
 
+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        predicted_answers = [
+            "Football is the most popular sport with around 4 billion followers worldwide.",
+            "I don't know.",
+        ]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator()
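Both tests rely on the same stubbing trick: `monkeypatch.setattr` swaps `OpenAIGenerator.run` for a local function, so no network call is made and the dummy API key is never used. A self-contained sketch of the pattern with illustrative names (not from the Haystack code base):

```python
# Self-contained illustration of the monkeypatch pattern used above.
# Names are illustrative, not part of Haystack. Run with: pytest <this_file>.py
import json


class Generator:
    def run(self, prompt):
        raise RuntimeError("would call a real LLM")


def test_stubbed_generator(monkeypatch):
    def fake_run(self, prompt):
        return {"replies": [json.dumps({"statements": [], "statement_scores": []})]}

    # Replace the method on the class for this test only, just as the tests
    # above patch OpenAIGenerator.run -- no API key or network needed.
    monkeypatch.setattr(Generator, "run", fake_run)

    reply = json.loads(Generator().run("any prompt")["replies"][0])
    assert reply["statements"] == []
```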
