fix: DocumentRecallEvaluator changing division and adding checks for emptiness of documents (#9380)

davidsbatista · anakin87 · julian-risch · web-flow · commit 42b378950f1e · 2025-05-14T11:37:47.000+02:00
* changing division and adding checks for emptiness of documents

* adding release notes

* adding tests

* Update releasenotes/notes/updated-doc-recall-eval-uniqueness-59b09082cf8e7593.yaml

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* addressing PR comments

* Update releasenotes/notes/updated-doc-recall-eval-uniqueness-59b09082cf8e7593.yaml

* Update releasenotes/notes/updated-doc-recall-eval-uniqueness-59b09082cf8e7593.yaml

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* Update haystack/components/evaluators/document_recall.py

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* Update haystack/components/evaluators/document_recall.py

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* Update haystack/components/evaluators/document_recall.py

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* Update haystack/components/evaluators/document_recall.py

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* adding tests

* linting

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
diff --git a/haystack/components/evaluators/document_recall.py b/haystack/components/evaluators/document_recall.py
@@ -5,9 +5,11 @@
 from enum import Enum
 from typing import Any, Dict, List, Union
 
-from haystack import component, default_to_dict
+from haystack import component, default_to_dict, logging
 from haystack.dataclasses import Document
 
+logger = logging.getLogger(__name__)
+
 
 class RecallMode(Enum):
     """
@@ -97,7 +99,21 @@ def _recall_multi_hit(ground_truth_documents: List[Document], retrieved_document
         unique_retrievals = {p.content for p in retrieved_documents}
         retrieved_ground_truths = unique_truths.intersection(unique_retrievals)
 
-        return len(retrieved_ground_truths) / len(ground_truth_documents)
+        if not unique_truths or unique_truths == {""}:
+            logger.warning(
+                "There are no ground truth documents or all of them have an empty string as content. "
+                "Score will be set to 0."
+            )
+            return 0.0
+
+        if not unique_retrievals or unique_retrievals == {""}:
+            logger.warning(
+                "There are no retrieved documents or all of them have an empty string as content. "
+                "Score will be set to 0."
+            )
+            return 0.0
+
+        return len(retrieved_ground_truths) / len(unique_truths)
 
     @component.output_types(score=float, individual_scores=List[float])
     def run(
diff --git a/releasenotes/notes/updated-doc-recall-eval-uniqueness-59b09082cf8e7593.yaml b/releasenotes/notes/updated-doc-recall-eval-uniqueness-59b09082cf8e7593.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    The `DocumentRecallEvaluator` was updated. In `MULTI_HIT` mode, recall is now computed as the number of unique ground truth documents that were retrieved divided by the number of unique ground truth documents, instead of dividing by the total number of ground truth documents (which counted duplicates).
+    We also added checks for emptiness to the `MULTI_HIT` computation. If there are no retrieved documents or all of them have an empty string as content, the score is set to 0.0 and a warning is logged. Likewise, if there are no ground truth documents or all of them have an empty string as content, the score is set to 0.0 and a warning is logged.
diff --git a/test/components/evaluators/test_document_recall.py b/test/components/evaluators/test_document_recall.py
@@ -13,6 +13,14 @@ def test_init_with_unknown_mode_string():
         DocumentRecallEvaluator(mode="unknown_mode")
 
 
+def test_init_with_string_mode():
+    evaluator = DocumentRecallEvaluator(mode="single_hit")
+    assert evaluator.mode == RecallMode.SINGLE_HIT
+
+    evaluator = DocumentRecallEvaluator(mode="multi_hit")
+    assert evaluator.mode == RecallMode.MULTI_HIT
+
+
 class TestDocumentRecallEvaluatorSingleHit:
     @pytest.fixture
     def evaluator(self):
@@ -186,3 +194,27 @@ def test_from_dict(self):
         }
         new_evaluator = default_from_dict(DocumentRecallEvaluator, data)
         assert new_evaluator.mode == RecallMode.MULTI_HIT
+
+    def test_empty_ground_truth_documents(self, evaluator):
+        ground_truth_documents = [[]]
+        retrieved_documents = [[Document(content="test")]]
+        score = evaluator.run(ground_truth_documents, retrieved_documents)
+        assert score == {"individual_scores": [0.0], "score": 0.0}
+
+    def test_empty_retrieved_documents(self, evaluator):
+        ground_truth_documents = [[Document(content="test")]]
+        retrieved_documents = [[]]
+        score = evaluator.run(ground_truth_documents, retrieved_documents)
+        assert score == {"individual_scores": [0.0], "score": 0.0}
+
+    def test_empty_string_ground_truth_documents(self, evaluator):
+        ground_truth_documents = [[Document(content="")]]
+        retrieved_documents = [[Document(content="test")]]
+        score = evaluator.run(ground_truth_documents, retrieved_documents)
+        assert score == {"individual_scores": [0.0], "score": 0.0}
+
+    def test_empty_string_retrieved_documents(self, evaluator):
+        ground_truth_documents = [[Document(content="test")]]
+        retrieved_documents = [[Document(content="")]]
+        score = evaluator.run(ground_truth_documents, retrieved_documents)
+        assert score == {"individual_scores": [0.0], "score": 0.0}