fix: document joiner division by zero with distribution based rank fusion (#8520)

AnesBenmerzoug · silvanocerza · web-flow · commit f5683bc8fa27 · 2024-11-14T11:41:28.000Z
* Parametrize document joiner tests with empty lists

* Skip loop in _distribution_based_rank_fusion if document list is empty

* Parametrize test_empty_list with join_mode

* Prevent division by zero in _merge and _reciprocal_rank_fusion

* Add release notes

---------

Co-authored-by: Silvano Cerza <silvanocerza@gmail.com>
diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py
@@ -166,6 +166,10 @@ def _merge(self, document_lists: List[List[Document]]) -> List[Document]:
         """
         Merge multiple lists of Documents and calculate a weighted sum of the scores of duplicate Documents.
         """
+        # This check prevents a division by zero when no documents are passed
+        if not document_lists:
+            return []
+
         scores_map: dict = defaultdict(int)
         documents_map = {}
         weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
@@ -187,6 +191,10 @@ def _reciprocal_rank_fusion(self, document_lists: List[List[Document]]) -> List[
         The constant k is set to 61 (60 was suggested by the original paper,
         plus 1 as python lists are 0-based and the paper used 1-based ranking).
         """
+        # This check prevents a division by zero when no documents are passed
+        if not document_lists:
+            return []
+
         k = 61
 
         scores_map: dict = defaultdict(int)
@@ -217,6 +225,9 @@ def _distribution_based_rank_fusion(self, document_lists: List[List[Document]])
         If a Document is in more than one retriever, the one with the highest score is used.
         """
         for documents in document_lists:
+            if len(documents) == 0:
+                continue
+
             scores_list = []
 
             for doc in documents:
diff --git a/releasenotes/notes/fix-document-joiner-division-by-zero-b24f95d37b007264.yaml b/releasenotes/notes/fix-document-joiner-division-by-zero-b24f95d37b007264.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fix `DocumentJoiner` failing when run with an empty list of `Document`s
diff --git a/test/components/joiners/test_document_joiner.py b/test/components/joiners/test_document_joiner.py
@@ -60,18 +60,45 @@ def test_from_dict_customs_parameters(self):
         assert document_joiner.top_k == 6
         assert not document_joiner.sort_by_score
 
-    def test_empty_list(self):
-        joiner = DocumentJoiner()
+    @pytest.mark.parametrize(
+        "join_mode",
+        [
+            JoinMode.CONCATENATE,
+            JoinMode.MERGE,
+            JoinMode.RECIPROCAL_RANK_FUSION,
+            JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
+        ],
+    )
+    def test_empty_list(self, join_mode: JoinMode):
+        joiner = DocumentJoiner(join_mode=join_mode)
         result = joiner.run([])
         assert result == {"documents": []}
 
-    def test_list_of_empty_lists(self):
-        joiner = DocumentJoiner()
+    @pytest.mark.parametrize(
+        "join_mode",
+        [
+            JoinMode.CONCATENATE,
+            JoinMode.MERGE,
+            JoinMode.RECIPROCAL_RANK_FUSION,
+            JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
+        ],
+    )
+    def test_list_of_empty_lists(self, join_mode: JoinMode):
+        joiner = DocumentJoiner(join_mode=join_mode)
         result = joiner.run([[], []])
         assert result == {"documents": []}
 
-    def test_list_with_one_empty_list(self):
-        joiner = DocumentJoiner()
+    @pytest.mark.parametrize(
+        "join_mode",
+        [
+            JoinMode.CONCATENATE,
+            JoinMode.MERGE,
+            JoinMode.RECIPROCAL_RANK_FUSION,
+            JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
+        ],
+    )
+    def test_list_with_one_empty_list(self, join_mode: JoinMode):
+        joiner = DocumentJoiner(join_mode=join_mode)
         documents = [Document(content="a"), Document(content="b"), Document(content="c")]
         result = joiner.run([[], documents])
         assert result == {"documents": documents}

-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
 +---
 +fixes:
 +  - |
 +    Fix `DocumentJoiner` failing when run with an empty list of `Document`s