Skip to content

Commit f5683bc

Browse files
fix: document joiner division by zero with distribution based rank fusion (#8520)
* Parametrize document joiner tests with empty lists
* Skip loop in _distribution_based_rank_fusion if document list is empty
* Parametrize test_empty_list with join_mode
* Prevent division by zero in _merge and _reciprocal_rank_fusion
* Add release notes

---------

Co-authored-by: Silvano Cerza <[email protected]>
1 parent e5a8072 commit f5683bc

File tree

3 files changed

+48
-6
lines changed

3 files changed

+48
-6
lines changed

haystack/components/joiners/document_joiner.py

+11
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ def _merge(self, document_lists: List[List[Document]]) -> List[Document]:
166166
"""
167167
Merge multiple lists of Documents and calculate a weighted sum of the scores of duplicate Documents.
168168
"""
169+
# Return early when no document lists are passed: the default weights below divide by len(document_lists)
170+
if not document_lists:
171+
return []
172+
169173
scores_map: dict = defaultdict(int)
170174
documents_map = {}
171175
weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
@@ -187,6 +191,10 @@ def _reciprocal_rank_fusion(self, document_lists: List[List[Document]]) -> List[
187191
The constant k is set to 61 (60 was suggested by the original paper,
188192
plus 1 as python lists are 0-based and the paper used 1-based ranking).
189193
"""
194+
# This check prevents a division by zero when no document lists are passed
195+
if not document_lists:
196+
return []
197+
190198
k = 61
191199

192200
scores_map: dict = defaultdict(int)
@@ -217,6 +225,9 @@ def _distribution_based_rank_fusion(self, document_lists: List[List[Document]])
217225
If a Document is in more than one retriever, the one with the highest score is used.
218226
"""
219227
for documents in document_lists:
228+
if len(documents) == 0:
229+
continue
230+
220231
scores_list = []
221232

222233
for doc in documents:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
Fix `DocumentJoiner` failing when run with an empty list of `Document`s

test/components/joiners/test_document_joiner.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -60,18 +60,45 @@ def test_from_dict_customs_parameters(self):
6060
assert document_joiner.top_k == 6
6161
assert not document_joiner.sort_by_score
6262

63-
def test_empty_list(self):
64-
joiner = DocumentJoiner()
63+
@pytest.mark.parametrize(
64+
"join_mode",
65+
[
66+
JoinMode.CONCATENATE,
67+
JoinMode.MERGE,
68+
JoinMode.RECIPROCAL_RANK_FUSION,
69+
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
70+
],
71+
)
72+
def test_empty_list(self, join_mode: JoinMode):
73+
joiner = DocumentJoiner(join_mode=join_mode)
6574
result = joiner.run([])
6675
assert result == {"documents": []}
6776

68-
def test_list_of_empty_lists(self):
69-
joiner = DocumentJoiner()
77+
@pytest.mark.parametrize(
78+
"join_mode",
79+
[
80+
JoinMode.CONCATENATE,
81+
JoinMode.MERGE,
82+
JoinMode.RECIPROCAL_RANK_FUSION,
83+
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
84+
],
85+
)
86+
def test_list_of_empty_lists(self, join_mode: JoinMode):
87+
joiner = DocumentJoiner(join_mode=join_mode)
7088
result = joiner.run([[], []])
7189
assert result == {"documents": []}
7290

73-
def test_list_with_one_empty_list(self):
74-
joiner = DocumentJoiner()
91+
@pytest.mark.parametrize(
92+
"join_mode",
93+
[
94+
JoinMode.CONCATENATE,
95+
JoinMode.MERGE,
96+
JoinMode.RECIPROCAL_RANK_FUSION,
97+
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
98+
],
99+
)
100+
def test_list_with_one_empty_list(self, join_mode: JoinMode):
101+
joiner = DocumentJoiner(join_mode=join_mode)
75102
documents = [Document(content="a"), Document(content="b"), Document(content="c")]
76103
result = joiner.run([[], documents])
77104
assert result == {"documents": documents}

0 commit comments

Comments
 (0)