fix: avoid FaithfulnessEvaluator and ContextRelevanceEvaluator returning `NaN` (#7685)

* initial import

* fixing tests

* relaxing condition

* adding safeguard for ContextRelevanceEvaluator as well

* adding release notes
davidsbatista authored May 14, 2024
1 parent cc869b1 commit 798dc4a
Showing 5 changed files with 80 additions and 2 deletions.
5 changes: 4 additions & 1 deletion haystack/components/evaluators/context_relevance.py
@@ -136,7 +136,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]

         # calculate average statement relevance score per query
         for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])

         # calculate average context relevance score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])
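The guard above covers the case where the LLM extracts no statements for a query: `np_mean` of an empty list evaluates to `NaN` (NumPy's "mean of empty slice"), and that `NaN` then leaks into the per-query and aggregate scores. A minimal sketch of the failure mode and of the guard, assuming `np_mean` is NumPy's `mean` imported under that alias (illustrative standalone snippet, not Haystack code):

import numpy as np

statement_scores = []  # the LLM extracted no statements for this query

# Before the fix: mean of an empty list is NaN (NumPy also emits a RuntimeWarning).
print(np.mean(statement_scores))  # nan

# Equivalent of the guard added above: empty statements score 0 instead of NaN.
score = 0 if not statement_scores else np.mean(statement_scores)
print(score)  # 0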
5 changes: 4 additions & 1 deletion haystack/components/evaluators/faithfulness.py
@@ -154,7 +154,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers

         # calculate average statement faithfulness score per query
         for res in result["results"]:
-            res["score"] = np_mean(res["statement_scores"])
+            if not res["statements"]:
+                res["score"] = 0
+            else:
+                res["score"] = np_mean(res["statement_scores"])

         # calculate average answer faithfulness score over all queries
         result["score"] = np_mean([res["score"] for res in result["results"]])
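The same safeguard is mirrored for `FaithfulnessEvaluator`. Returning `0` for a query with no extracted statements also keeps the aggregate usable, because a single `NaN` would otherwise propagate through the overall mean. A small worked example in plain NumPy (not Haystack code), matching the `0.25` expected by the new tests below:

import numpy as np

# Before the fix: one empty-statement query turned the overall score into NaN.
print(np.mean([0.5, float("nan")]))  # nan

# After the fix: that query contributes 0, so the overall score stays defined.
print(np.mean([0.5, 0.0]))  # 0.25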
4 changes: 4 additions & 0 deletions (new release note)
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` now return `0` instead of `NaN` when applied to an empty context or empty statements.
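For context, a minimal usage sketch of the fixed behavior. This is an illustrative snippet, not part of the commit; it assumes a valid `OPENAI_API_KEY` in the environment and that `ContextRelevanceEvaluator` is importable from `haystack.components.evaluators`. With an empty context the model typically extracts no statements, so the per-query score should now be `0` rather than `NaN`:

from haystack.components.evaluators import ContextRelevanceEvaluator

# Requires OPENAI_API_KEY; the evaluator prompts an OpenAI model under the hood.
evaluator = ContextRelevanceEvaluator()

result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[[]],  # empty context: nothing to extract statements from
)
print(result["score"])  # expected: 0 after this fix, not NaN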
32 changes: 32 additions & 0 deletions test/components/evaluators/test_context_relevance_evaluator.py
@@ -121,6 +121,38 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }

+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        results = component.run(questions=questions, contexts=contexts)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator()
36 changes: 36 additions & 0 deletions test/components/evaluators/test_faithfulness_evaluator.py
@@ -149,6 +149,42 @@ def generator_run(self, *args, **kwargs):
             "score": 0.75,
         }

+    def test_run_no_statements_extracted(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator()
+
+        def generator_run(self, *args, **kwargs):
+            if "Football" in kwargs["prompt"]:
+                return {"replies": ['{"statements": ["a", "b"], "statement_scores": [1, 0]}']}
+            else:
+                return {"replies": ['{"statements": [], "statement_scores": []}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [],
+        ]
+        predicted_answers = [
+            "Football is the most popular sport with around 4 billion followers worldwide.",
+            "I don't know.",
+        ]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        assert results == {
+            "individual_scores": [0.5, 0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 0, "statement_scores": [], "statements": []},
+            ],
+            "score": 0.25,
+        }
+
     def test_run_missing_parameters(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator()
