Commit 3ae34b5

feat: Add DocumentNDCGEvaluator component (#8419)
* draft new component and tests
* draft new component and tests
* fix tests, replace usage of get_attr
* improve docstrings, refactor tests
* add test for mixed documents w/wo scores
* add test with multiple lists and update docstring
* validate inputs, add tests, make methods static
* change fallback to binary relevance
* rename validate_init_parameters to validate_inputs
1 parent 70d27e3 commit 3ae34b5

File tree

5 files changed: +342 −1 lines

docs/pydoc/config/evaluators_api.yml (+1 −1)

```diff
@@ -7,7 +7,7 @@ loaders:
         "context_relevance",
         "document_map",
         "document_mrr",
-        "document_recall",
+        "document_ndcg",
         "document_recall",
         "faithfulness",
         "llm_evaluator",
```

haystack/components/evaluators/__init__.py (+2)

```diff
@@ -6,6 +6,7 @@
 from .context_relevance import ContextRelevanceEvaluator
 from .document_map import DocumentMAPEvaluator
 from .document_mrr import DocumentMRREvaluator
+from .document_ndcg import DocumentNDCGEvaluator
 from .document_recall import DocumentRecallEvaluator
 from .faithfulness import FaithfulnessEvaluator
 from .llm_evaluator import LLMEvaluator
@@ -16,6 +17,7 @@
     "ContextRelevanceEvaluator",
     "DocumentMAPEvaluator",
     "DocumentMRREvaluator",
+    "DocumentNDCGEvaluator",
     "DocumentRecallEvaluator",
     "FaithfulnessEvaluator",
     "LLMEvaluator",
```
haystack/components/evaluators/document_ndcg.py (new file, +133)

````python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from math import log2
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentNDCGEvaluator:
    """
    Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents.

    Each question can have multiple ground truth documents and multiple retrieved documents.
    If the ground truth documents have relevance scores, the NDCG calculation uses these scores.
    Otherwise, it assumes binary relevance of all ground truth documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators import DocumentNDCGEvaluator

    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]],
        retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
    )
    print(result["individual_scores"])
    # [0.8869]
    print(result["score"])
    # 0.8869
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentNDCGEvaluator on the given inputs.

        `ground_truth_documents` and `retrieved_documents` must have the same length.
        The list items within `ground_truth_documents` and `retrieved_documents` can differ in length.

        :param ground_truth_documents:
            Lists of expected documents, one list per question. Binary relevance is used if documents have no scores.
        :param retrieved_documents:
            Lists of retrieved documents, one list per question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of calculated scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the NDCG for each question.
        """
        self.validate_inputs(ground_truth_documents, retrieved_documents)

        individual_scores = []

        for gt_docs, ret_docs in zip(ground_truth_documents, retrieved_documents):
            dcg = self.calculate_dcg(gt_docs, ret_docs)
            idcg = self.calculate_idcg(gt_docs)
            ndcg = dcg / idcg if idcg > 0 else 0
            individual_scores.append(ndcg)

        score = sum(individual_scores) / len(ground_truth_documents)

        return {"score": score, "individual_scores": individual_scores}

    @staticmethod
    def validate_inputs(gt_docs: List[List[Document]], ret_docs: List[List[Document]]):
        """
        Validate the input parameters.

        :param gt_docs:
            The ground_truth_documents to validate.
        :param ret_docs:
            The retrieved_documents to validate.

        :raises ValueError:
            If the ground_truth_documents or the retrieved_documents are an empty list.
            If the length of ground_truth_documents and retrieved_documents differs.
            If any list of documents in ground_truth_documents contains a mix of documents with and without a score.
        """
        if len(gt_docs) == 0 or len(ret_docs) == 0:
            msg = "ground_truth_documents and retrieved_documents must be provided."
            raise ValueError(msg)

        if len(gt_docs) != len(ret_docs):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        for docs in gt_docs:
            if any(doc.score is not None for doc in docs) and any(doc.score is None for doc in docs):
                msg = "Either none or all documents in each list of ground_truth_documents must have a score."
                raise ValueError(msg)

    @staticmethod
    def calculate_dcg(gt_docs: List[Document], ret_docs: List[Document]) -> float:
        """
        Calculate the discounted cumulative gain (DCG) of the retrieved documents.

        :param gt_docs:
            The ground truth documents.
        :param ret_docs:
            The retrieved documents.
        :returns:
            The discounted cumulative gain (DCG) of the retrieved
            documents based on the ground truth documents.
        """
        dcg = 0.0
        relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs}
        for i, doc in enumerate(ret_docs):
            if doc.id in relevant_id_to_score:  # TODO Related to https://github.com/deepset-ai/haystack/issues/8412
                dcg += relevant_id_to_score[doc.id] / log2(i + 2)  # i + 2 because i is 0-indexed
        return dcg

    @staticmethod
    def calculate_idcg(gt_docs: List[Document]) -> float:
        """
        Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents.

        :param gt_docs:
            The ground truth documents.
        :returns:
            The ideal discounted cumulative gain (IDCG) of the ground truth documents.
        """
        idcg = 0.0
        for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)):
            # If the document has a score, use it; otherwise, use 1 for binary relevance.
            relevance = doc.score if doc.score is not None else 1
            idcg += relevance / log2(i + 2)  # i + 2 because i is 0-indexed
        return idcg
````
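Beyond the docstring example, here is a minimal sketch (not part of the commit) showing the two relevance modes the component supports: graded relevance when the ground truth documents carry scores, and the binary fallback when they do not. The document contents are made up for illustration.

```python
from haystack import Document
from haystack.components.evaluators import DocumentNDCGEvaluator

evaluator = DocumentNDCGEvaluator()

result = evaluator.run(
    ground_truth_documents=[
        # Query 1: graded relevance; the scores are used as gains in DCG/IDCG.
        [Document(content="Berlin is the capital of Germany", score=3),
         Document(content="Berlin has 3.7 million inhabitants", score=1)],
        # Query 2: no scores, so every ground truth document gets a gain of 1.
        [Document(content="Paris is the capital of France")],
    ],
    retrieved_documents=[
        [Document(content="Berlin has 3.7 million inhabitants"),
         Document(content="Berlin is the capital of Germany")],
        [Document(content="Paris is the capital of France"),
         Document(content="Lyon is a city in France")],
    ],
)

print(result["individual_scores"])  # one NDCG value in [0, 1] per query
print(result["score"])              # the average over all queries
```

Matching between ground truth and retrieved documents is done via `Document.id` (see `calculate_dcg` above), which by default is derived from the document's content and metadata, so retrieved copies without scores still match their ground truth counterparts; the docstring example relies on the same behavior.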
Release note (new file, +4)

```yaml
---
features:
  - |
    Added a new component DocumentNDCGEvaluator, which is similar to DocumentMRREvaluator and useful for retrieval evaluation. It calculates the normalized discounted cumulative gain (NDCG), an evaluation metric that is useful when there are multiple ground truth relevant documents and the order in which they are retrieved is important.
```
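For reference, the metric described in the release note, written the way the new component computes it (a restatement, not text from the commit): for each query,

$$
\mathrm{NDCG} = \frac{\mathrm{DCG}}{\mathrm{IDCG}}, \qquad
\mathrm{DCG} = \sum_{i=1}^{k} \frac{\mathrm{rel}_i}{\log_2(i+1)}, \qquad
\mathrm{IDCG} = \sum_{j=1}^{m} \frac{\mathrm{rel}^{*}_j}{\log_2(j+1)}
$$

where $\mathrm{rel}_i$ is the ground truth relevance of the document retrieved at rank $i$ (its ground truth score, 1 under binary relevance, or 0 if it is not in the ground truth), and $\mathrm{rel}^{*}_1 \ge \dots \ge \mathrm{rel}^{*}_m$ are the ground truth relevances sorted in descending order. The component returns 0 for a query whose IDCG is 0, and `score` is the mean of the per-query values.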
Tests for DocumentNDCGEvaluator (new file, +202)

```python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import pytest

from haystack import Document
from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator


def test_run_with_scores():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [
                Document(content="doc1", score=3),
                Document(content="doc2", score=2),
                Document(content="doc3", score=3),
                Document(content="doc6", score=2),
                Document(content="doc7", score=3),
                Document(content="doc8", score=2),
            ]
        ],
        retrieved_documents=[
            [
                Document(content="doc1"),
                Document(content="doc2"),
                Document(content="doc3"),
                Document(content="doc4"),
                Document(content="doc5"),
            ]
        ],
    )
    assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4)
    assert result["score"] == pytest.approx(0.6592, abs=1e-4)


def test_run_without_scores():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="France"), Document(content="Paris")]],
        retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
    )
    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
    assert result["score"] == pytest.approx(0.9197, abs=1e-4)


def test_run_with_multiple_lists_of_docs():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France"), Document(content="Paris")],
            [
                Document(content="doc1", score=3),
                Document(content="doc2", score=2),
                Document(content="doc3", score=3),
                Document(content="doc6", score=2),
                Document(content="doc7", score=3),
                Document(content="doc8", score=2),
            ],
        ],
        retrieved_documents=[
            [Document(content="France"), Document(content="Germany"), Document(content="Paris")],
            [
                Document(content="doc1"),
                Document(content="doc2"),
                Document(content="doc3"),
                Document(content="doc4"),
                Document(content="doc5"),
            ],
        ],
    )
    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
    assert result["individual_scores"][1] == pytest.approx(0.6592, abs=1e-4)
    assert result["score"] == pytest.approx(0.7895, abs=1e-4)


def test_run_with_different_lengths():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )
    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")]],
        )


def test_run_with_mixed_documents_with_and_without_scores():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(
            ground_truth_documents=[[Document(content="France", score=3), Document(content="Paris")]],
            retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
        )


def test_run_empty_retrieved():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[]])
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_empty_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[Document(content="France")]])
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_empty_retrieved_and_empty_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[]])
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_no_retrieved():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[])


def test_run_no_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(ground_truth_documents=[], retrieved_documents=[[Document(content="France")]])


def test_run_no_retrieved_and_no_ground_truth():
    evaluator = DocumentNDCGEvaluator()
    with pytest.raises(ValueError):
        evaluator.run(ground_truth_documents=[], retrieved_documents=[])


def test_calculate_dcg_with_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [
        Document(content="doc1", score=3),
        Document(content="doc2", score=2),
        Document(content="doc3", score=3),
        Document(content="doc4", score=0),
        Document(content="doc5", score=1),
        Document(content="doc6", score=2),
    ]
    ret_docs = [
        Document(content="doc1"),
        Document(content="doc2"),
        Document(content="doc3"),
        Document(content="doc4"),
        Document(content="doc5"),
        Document(content="doc6"),
    ]
    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
    assert dcg == pytest.approx(6.8611, abs=1e-4)


def test_calculate_dcg_without_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [Document(content="doc1"), Document(content="doc2")]
    ret_docs = [Document(content="doc2"), Document(content="doc3"), Document(content="doc1")]
    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
    assert dcg == pytest.approx(1.5, abs=1e-4)


def test_calculate_dcg_empty():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [Document(content="doc1")]
    ret_docs = []
    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
    assert dcg == 0


def test_calculate_idcg_with_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [
        Document(content="doc1", score=3),
        Document(content="doc2", score=3),
        Document(content="doc3", score=2),
        Document(content="doc4", score=3),
        Document(content="doc5", score=2),
        Document(content="doc6", score=2),
    ]
    idcg = evaluator.calculate_idcg(gt_docs)
    assert idcg == pytest.approx(8.7403, abs=1e-4)


def test_calculate_idcg_without_scores():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = [Document(content="doc1"), Document(content="doc2"), Document(content="doc3")]
    idcg = evaluator.calculate_idcg(gt_docs)
    assert idcg == pytest.approx(2.1309, abs=1e-4)


def test_calculate_idcg_empty():
    evaluator = DocumentNDCGEvaluator()
    gt_docs = []
    idcg = evaluator.calculate_idcg(gt_docs)
    assert idcg == 0
```
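As a sanity check (not part of the commit), the 0.9197 expected in `test_run_without_scores` can be reproduced directly from the DCG/IDCG definitions in `document_ndcg.py`:

```python
from math import log2

# ground truth: [France, Paris] with no scores -> binary gain of 1 each
# retrieved:    [France, Germany, Paris]
dcg = 1 / log2(0 + 2) + 1 / log2(2 + 2)   # France at index 0, Paris at index 2
idcg = 1 / log2(0 + 2) + 1 / log2(1 + 2)  # both relevant docs in the ideal order
print(round(dcg, 4), round(idcg, 4), round(dcg / idcg, 4))  # 1.5 1.6309 0.9197
```

The 2.1309 in `test_calculate_idcg_without_scores` is the same sum extended to three documents: 1 + 1/log2(3) + 1/log2(4) ≈ 2.1309.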
