feat: Allow the scores returned by AI Search to be populated in the Document.meta (#1907)

Seth-Peters · Amnah199 · web-flow · commit 4e7092d0a464 · 2025-06-27T17:04:54.000+02:00
* feat: Allow the search scores to be populated in the Document.meta - exposing these scores (as they are very critical information) to users of this integration.

* feat: Allow the search scores to be populated in the Document.meta - exposing these scores (as they are very critical information) to users of this integration.

* chore: running linter

* Add a new param for search scores

* Fix linting

* feat: put the @search.score, if it exists, into the Document.score

* fix: put back accidentally removed # noqa: B008

* fix: running linter

* Update tests

* Update document_store

* Fix linting

* PR comments and test updates

* Fixes

---------

Co-authored-by: Amna Mubashar <amnahkhan.ak@gmail.com>
diff --git a/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py b/integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py
@@ -6,7 +6,11 @@
 from typing import Any, Dict, List, Optional, Type, Union
 
 from azure.core.credentials import AzureKeyCredential
-from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError
+from azure.core.exceptions import (
+    ClientAuthenticationError,
+    HttpResponseError,
+    ResourceNotFoundError,
+)
 from azure.core.pipeline.policies import UserAgentPolicy
 from azure.identity import DefaultAzureCredential
 from azure.search.documents import SearchClient
@@ -67,7 +71,10 @@
 
 DEFAULT_VECTOR_SEARCH = VectorSearch(
     profiles=[
-        VectorSearchProfile(name="default-vector-config", algorithm_configuration_name="cosine-algorithm-config")
+        VectorSearchProfile(
+            name="default-vector-config",
+            algorithm_configuration_name="cosine-algorithm-config",
+        )
     ],
     algorithms=[
         HnswAlgorithmConfiguration(
@@ -94,6 +101,7 @@ def __init__(
         embedding_dimension: int = 768,
         metadata_fields: Optional[Dict[str, Union[SearchField, type]]] = None,
         vector_search_configuration: Optional[VectorSearch] = None,
+        include_search_metadata: bool = False,
         **index_creation_kwargs: Any,
     ):
         """
@@ -123,6 +131,10 @@ def __init__(
         :param vector_search_configuration: Configuration option related to vector search.
             Default configuration uses the HNSW algorithm with cosine similarity to handle vector searches.
 
+        :param include_search_metadata: Whether to include Azure AI Search metadata fields
+            in the returned documents. When set to True, the `meta` field of the returned
+            documents will contain the @search.score, @search.reranker_score, @search.highlights,
+            @search.captions, and other fields returned by Azure AI Search.
         :param index_creation_kwargs: Optional keyword parameters to be passed to `SearchIndex` class
             during index creation. Some of the supported parameters:
                 - `semantic_search`: Defines semantic configuration of the search index. This parameter is needed
@@ -143,6 +155,7 @@ def __init__(
         self._dummy_vector = [-10.0] * self._embedding_dimension
         self._metadata_fields = self._normalize_metadata_index_fields(metadata_fields)
         self._vector_search_configuration = vector_search_configuration or DEFAULT_VECTOR_SEARCH
+        self._include_search_metadata = include_search_metadata
         self._index_creation_kwargs = index_creation_kwargs
 
     @property
@@ -256,7 +269,9 @@ def _create_index(self) -> None:
             self._index_client.create_index(index)
 
     @staticmethod
-    def _serialize_index_creation_kwargs(index_creation_kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def _serialize_index_creation_kwargs(
+        index_creation_kwargs: Dict[str, Any],
+    ) -> Dict[str, Any]:
         """
         Serializes the index creation kwargs to a dictionary.
         This is needed to handle serialization of Azure AI Search classes
@@ -300,7 +315,7 @@ def to_dict(self) -> Dict[str, Any]:
         """
         return default_to_dict(
             self,
-            azure_endpoint=self._azure_endpoint.to_dict() if self._azure_endpoint else None,
+            azure_endpoint=(self._azure_endpoint.to_dict() if self._azure_endpoint else None),
             api_key=self._api_key.to_dict() if self._api_key else None,
             index_name=self._index_name,
             embedding_dimension=self._embedding_dimension,
@@ -423,19 +438,28 @@ def _convert_search_result_to_documents(self, azure_docs: List[Dict[str, Any]])
 
         for azure_doc in azure_docs:
             embedding = azure_doc.get("embedding")
+            score = azure_doc.get("@search.score", None)
             if embedding == self._dummy_vector:
                 embedding = None
+            meta = {}
 
             # Anything besides default fields (id, content, and embedding) is considered metadata
-            meta = {
-                key: value
-                for key, value in azure_doc.items()
-                if key not in ["id", "content", "embedding"] and key in self._index_fields and value is not None
-            }
+            if self._include_search_metadata:
+                meta = {key: value for key, value in azure_doc.items() if key not in ["id", "content", "embedding"]}
+            else:
+                meta = {
+                    key: value
+                    for key, value in azure_doc.items()
+                    if key not in ["id", "content", "embedding"] and key in self._index_fields and value is not None
+                }
 
             # Create the document with meta only if it's non-empty
             doc = Document(
-                id=azure_doc["id"], content=azure_doc["content"], embedding=embedding, meta=meta if meta else {}
+                id=azure_doc["id"],
+                content=azure_doc["content"],
+                embedding=embedding,
+                meta=meta,
+                score=score,
             )
 
             documents.append(doc)
diff --git a/integrations/azure_ai_search/tests/conftest.py b/integrations/azure_ai_search/tests/conftest.py
@@ -28,6 +28,7 @@ def document_store(request):
     """
     index_name = f"haystack_test_{uuid.uuid4().hex}"
     metadata_fields = getattr(request, "param", {}).get("metadata_fields", None)
+    include_search_metadata = getattr(request, "param", {}).get("include_search_metadata", False)
 
     azure_endpoint = os.environ["AZURE_AI_SEARCH_ENDPOINT"]
     api_key = os.environ["AZURE_AI_SEARCH_API_KEY"]
@@ -41,6 +42,7 @@ def document_store(request):
         create_index=True,
         embedding_dimension=768,
         metadata_fields=metadata_fields,
+        include_search_metadata=include_search_metadata,
     )
 
     # Override some methods to wait for the documents to be available
diff --git a/integrations/azure_ai_search/tests/test_bm25_retriever.py b/integrations/azure_ai_search/tests/test_bm25_retriever.py
@@ -159,7 +159,23 @@ def test_run(self, document_store: AzureAISearchDocumentStore):
         document_store.write_documents(docs)
         retriever = AzureAISearchBM25Retriever(document_store=document_store)
         res = retriever.run(query="Test document")
-        assert res["documents"] == docs
+        assert res["documents"][0].content == docs[0].content
+        assert res["documents"][0].score is not None
+        assert res["documents"][0].id == docs[0].id
+
+    @pytest.mark.parametrize(
+        "document_store",
+        [
+            {"include_search_metadata": True},
+        ],
+        indirect=True,
+    )
+    def test_run_with_search_metadata(self, document_store: AzureAISearchDocumentStore):
+        docs = [Document(id="1", content="Test document")]
+        document_store.write_documents(docs)
+        retriever = AzureAISearchBM25Retriever(document_store=document_store)
+        res = retriever.run(query="Test document")
+        assert all(key.startswith("@search") for key in res["documents"][0].meta.keys())
 
     def test_document_retrieval(self, document_store: AzureAISearchDocumentStore):
         docs = [
diff --git a/integrations/azure_ai_search/tests/test_document_store.py b/integrations/azure_ai_search/tests/test_document_store.py
@@ -231,12 +231,36 @@ def test_init(_mock_azure_search_client):
     assert document_store._vector_search_configuration == DEFAULT_VECTOR_SEARCH
 
 
+def _assert_documents_are_equal(received: List[Document], expected: List[Document]):
+    """
+    Assert that two lists of Documents are equal.
+
+    This is used in every test, if a Document Store implementation has a different behaviour
+    it should override this method. This can happen for example when the Document Store sets
+    a score to returned Documents. Since we can't know what the score will be, we can't compare
+    the Documents reliably.
+    """
+    sorted_received = sorted(received, key=lambda doc: doc.id)
+    sorted_expected = sorted(expected, key=lambda doc: doc.id)
+    assert len(sorted_received) == len(sorted_expected)
+
+    for received_doc, expected_doc in zip(sorted_received, sorted_expected):
+        # Compare all attributes except score
+        assert received_doc.id == expected_doc.id
+        assert received_doc.content == expected_doc.content
+        assert received_doc.embedding == expected_doc.embedding
+        assert received_doc.meta == expected_doc.meta
+
+
 @pytest.mark.integration
 @pytest.mark.skipif(
     not os.environ.get("AZURE_AI_SEARCH_ENDPOINT", None) and not os.environ.get("AZURE_AI_SEARCH_API_KEY", None),
     reason="Missing AZURE_AI_SEARCH_ENDPOINT or AZURE_AI_SEARCH_API_KEY.",
 )
 class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
+    def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
+        _assert_documents_are_equal(received, expected)
+
     def test_write_documents(self, document_store: AzureAISearchDocumentStore):
         docs = [Document(id="1")]
         assert document_store.write_documents(docs) == 1
@@ -345,17 +369,7 @@ def filterable_docs(self) -> List[Document]:
 
     # Overriding to compare the documents with the same order
     def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
-        """
-        Assert that two lists of Documents are equal.
-
-        This is used in every test, if a Document Store implementation has a different behaviour
-        it should override this method. This can happen for example when the Document Store sets
-        a score to returned Documents. Since we can't know what the score will be, we can't compare
-        the Documents reliably.
-        """
-        sorted_recieved = sorted(received, key=lambda doc: doc.id)
-        sorted_expected = sorted(expected, key=lambda doc: doc.id)
-        assert sorted_recieved == sorted_expected
+        _assert_documents_are_equal(received, expected)
 
     # Azure search index supports UTC datetime in ISO 8601 format
     def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs):
diff --git a/integrations/azure_ai_search/tests/test_embedding_retriever.py b/integrations/azure_ai_search/tests/test_embedding_retriever.py
@@ -174,7 +174,7 @@ def test_run(self, document_store: AzureAISearchDocumentStore):
         document_store.write_documents(docs)
         retriever = AzureAISearchEmbeddingRetriever(document_store=document_store)
         res = retriever.run(query_embedding=[0.1] * 768)
-        assert res["documents"] == docs
+        assert res["documents"][0].id == docs[0].id
 
     def test_embedding_retrieval(self, document_store: AzureAISearchDocumentStore):
         query_embedding = [0.1] * 768
diff --git a/integrations/azure_ai_search/tests/test_hybrid_retriever.py b/integrations/azure_ai_search/tests/test_hybrid_retriever.py
@@ -180,7 +180,7 @@ def test_run(self, document_store: AzureAISearchDocumentStore):
         document_store.write_documents(docs)
         retriever = AzureAISearchHybridRetriever(document_store=document_store)
         res = retriever.run(query="Test document", query_embedding=[0.1] * 768)
-        assert res["documents"] == docs
+        assert res["documents"][0].id == docs[0].id
 
     def test_hybrid_retrieval(self, document_store: AzureAISearchDocumentStore):
         query_embedding = [0.1] * 768