
Commit 0082ae6

fix: avoid FaithfulnessEvaluator and ContextRelevanceEvaluator returning NaN (#7685)

* initial import
* fixing tests
* relaxing condition
* adding safeguard for ContextRelevanceEvaluator as well
* adding release notes

1 parent 96da73e · commit 0082ae6
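
The NaN in question comes from averaging an empty list: when the LLM extracts no statements for a query, `statement_scores` is empty, and numpy's mean of an empty sequence is NaN. A minimal sketch of the pre-fix failure mode (illustration only, not the Haystack code itself):

# numpy's mean of an empty sequence is NaN and emits
# "RuntimeWarning: Mean of empty slice."
from numpy import mean as np_mean

statement_scores = []             # no statements extracted for this query
print(np_mean(statement_scores))  # nan -- this used to become the score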

File tree: 5 files changed, +80 -2 lines changed

Diff for: haystack/components/evaluators/context_relevance.py (+4, -1)

@@ -132,7 +132,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
 
         # calculate average statement relevance score per query
         for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])
 
         # calculate average context relevance score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])

Diff for: haystack/components/evaluators/faithfulness.py (+4, -1)

@@ -150,7 +150,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
 
         # calculate average statement faithfulness score per query
         for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])
 
         # calculate average answer faithfulness score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])
Diff for: release note (new file) (+4)

@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` now return `0` instead of `NaN` when applied to an empty context or empty statements.
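
In practice, a query with an empty context (or an answer that yields no statements) now contributes 0 to the aggregate score instead of poisoning it with NaN. A hedged usage sketch, assuming a valid OPENAI_API_KEY and this public import path; real scores depend on the LLM's judgments:

from haystack.components.evaluators import ContextRelevanceEvaluator

evaluator = ContextRelevanceEvaluator()
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[[]],  # empty context, as exercised by the tests below
)
print(result["score"])  # 0 after this fix; NaN before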

Diff for: test/components/evaluators/test_context_relevance_evaluator.py (+32)

@@ -118,6 +118,38 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }
 
+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        results = component.run(questions=questions, contexts=contexts)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator()

Diff for: test/components/evaluators/test_faithfulness_evaluator.py (+36)

@@ -146,6 +146,42 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }
 
+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        predicted_answers = [
+            "Football is the most popular sport with around 4 billion followers worldwide.",
+            "I don't know.",
+        ]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator()
