feat: add asynchronous embedding methods for GoogleGenAIDocumentEmbedder and GoogleGenAITextEmbedder (#1983)

garybadwal · davidsbatista · web-flow · commit 2563024deb90 · 2025-06-24T14:25:52.000+02:00
* feat: Add GoogleAITextEmbedder and GoogleAIDocumentEmbedder components

* fix: Improve error messages for input type validation in GoogleAITextEmbedder and GoogleAIDocumentEmbedder

* feat: add Google GenAI embedder components for document and text embeddings

* feat: add unit tests for GoogleAIDocumentEmbedder and GoogleAITextEmbedder

* refactor: clean up imports and improve list handling in GoogleAIDocumentEmbedder and GoogleAITextEmbedder tests

* refactor: Rename classes and update imports for Google GenAI components

* feat: Add additional modules for Google GenAI embedders in config

* chore: add 'more-itertools' to lint environment dependencies

* refactor: update GoogleGenAIDocumentEmbedder and GoogleGenAITextEmbedder to use private attributes for initialization

* refactor: update _prepare_texts_to_embed to return a list instead of a dictionary

* refactor: format code for better readability and consistency in document embedder

* refactor: improve code formatting for consistency and readability in document embedder and tests

* refactor: update _prepare_texts_to_embed to return a list instead of a dictionary

* feat: add new author to project metadata in pyproject.toml

* feat: add asynchronous embedding methods for GoogleGenAIDocumentEmbedder and GoogleGenAITextEmbedder

* fix: ensure consistent formatting for pylint

* fix: update return type annotation for run_async method in GoogleGenAIDocumentEmbedder

* fix: update return type annotation for run_async method in GoogleGenAITextEmbedder

* fix: update return type annotation and handle None values in _embed_batch_async method

* fix: remove unnecessary blank line in _embed_batch_async method

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>
diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py
@@ -171,6 +171,37 @@ def _embed_batch(
 
         return all_embeddings, meta
 
+    async def _embed_batch_async(
+        self, texts_to_embed: List[str], batch_size: int
+    ) -> Tuple[List[Optional[List[float]]], Dict[str, Any]]:
+        """
+        Embed a list of texts in batches asynchronously.
+        """
+
+        all_embeddings = []
+        meta: Dict[str, Any] = {}
+        for batch in tqdm(
+            batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings"
+        ):
+            args: Dict[str, Any] = {"model": self._model, "contents": [b[1] for b in batch]}
+            if self._config:
+                args["config"] = types.EmbedContentConfig(**self._config) if self._config else None
+
+            response = await self._client.aio.models.embed_content(**args)
+
+            embeddings = []
+            if response.embeddings:
+                for el in response.embeddings:
+                    embeddings.append(el.values if el.values else None)
+                all_embeddings.extend(embeddings)
+            else:
+                all_embeddings.extend([None] * len(batch))
+
+            if "model" not in meta:
+                meta["model"] = self._model
+
+        return all_embeddings, meta
+
     @component.output_types(documents=List[Document], meta=Dict[str, Any])
     def run(self, documents: List[Document]) -> Union[Dict[str, List[Document]], Dict[str, Any]]:
         """
@@ -200,3 +231,32 @@ def run(self, documents: List[Document]) -> Union[Dict[str, List[Document]], Dic
             doc.embedding = emb
 
         return {"documents": documents, "meta": meta}
+
+    @component.output_types(documents=List[Document], meta=Dict[str, Any])
+    async def run_async(self, documents: List[Document]) -> Union[Dict[str, List[Document]], Dict[str, Any]]:
+        """
+        Embeds a list of documents asynchronously.
+
+        :param documents:
+            A list of documents to embed.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: A list of documents with embeddings.
+            - `meta`: Information about the usage of the model.
+        """
+        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
+            error_message_documents = (
+                "GoogleGenAIDocumentEmbedder expects a list of Documents as input. "
+                "In case you want to embed a string, please use the GoogleGenAITextEmbedder."
+            )
+            raise TypeError(error_message_documents)
+
+        texts_to_embed = self._prepare_texts_to_embed(documents=documents)
+
+        embeddings, meta = await self._embed_batch_async(texts_to_embed=texts_to_embed, batch_size=self._batch_size)
+
+        for doc, emb in zip(documents, embeddings):
+            doc.embedding = emb
+
+        return {"documents": documents, "meta": meta}
diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py
@@ -138,3 +138,23 @@ def run(self, text: str) -> Union[Dict[str, List[float]], Dict[str, Any]]:
         create_kwargs = self._prepare_input(text=text)
         response = self._client.models.embed_content(**create_kwargs)
         return self._prepare_output(result=response)
+
+    @component.output_types(embedding=List[float], meta=Dict[str, Any])
+    async def run_async(self, text: str) -> Union[Dict[str, List[float]], Dict[str, Any]]:
+        """
+        Asynchronously embed a single string.
+
+        This is the asynchronous version of the `run` method. It has the same parameters and return values
+        but can be used with `await` in async code.
+
+        :param text:
+            Text to embed.
+
+        :returns:
+            A dictionary with the following keys:
+            - `embedding`: The embedding of the input text.
+            - `meta`: Information about the usage of the model.
+        """
+        create_kwargs = self._prepare_input(text=text)
+        response = await self._client.aio.models.embed_content(**create_kwargs)
+        return self._prepare_output(result=response)
diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py
@@ -233,6 +233,39 @@ def test_run(self):
             assert len(doc.embedding) == 768
             assert all(isinstance(x, float) for x in doc.embedding)
 
+        assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], (
+            "The model name does not contain 'text' and '004'"
+        )
+
+    @pytest.mark.asyncio
+    @pytest.mark.skipif(
+        not os.environ.get("GOOGLE_API_KEY", None),
+        reason="Export an env var called GOOGLE_API_KEY containing the Google API key to run this test.",
+    )
+    @pytest.mark.integration
+    async def test_run_async(self):
+        docs = [
+            Document(content="I love cheese", meta={"topic": "Cuisine"}),
+            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
+        ]
+
+        model = "text-embedding-004"
+
+        embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=["topic"], embedding_separator=" | ")
+
+        result = await embedder.run_async(documents=docs)
+        documents_with_embeddings = result["documents"]
+        assert isinstance(documents_with_embeddings, list)
+        assert len(documents_with_embeddings) == len(docs)
+        for doc in documents_with_embeddings:
+            assert isinstance(doc, Document)
+            assert isinstance(doc.embedding, list)
+            assert len(doc.embedding) == 768
+            assert all(isinstance(x, float) for x in doc.embedding)
+
+        assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], (
+            "The model name does not contain 'text' and '004'"
+        )
         assert result["documents"][0].meta == {"topic": "Cuisine"}
         assert result["documents"][1].meta == {"topic": "ML"}
         assert result["meta"] == {"model": model}
diff --git a/integrations/google_genai/tests/test_text_embedder.py b/integrations/google_genai/tests/test_text_embedder.py
@@ -160,4 +160,26 @@ def test_run(self):
         assert len(result["embedding"]) == 768
         assert all(isinstance(x, float) for x in result["embedding"])
 
+        assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], (
+            "The model name does not contain 'text' and '004'"
+        )
+
+    @pytest.mark.asyncio
+    @pytest.mark.skipif(
+        not os.environ.get("GOOGLE_API_KEY", None),
+        reason="Export an env var called GOOGLE_API_KEY containing the Google API key to run this test.",
+    )
+    @pytest.mark.integration
+    async def test_run_async(self):
+        model = "text-embedding-004"
+
+        embedder = GoogleGenAITextEmbedder(model=model)
+        result = await embedder.run_async(text="The food was delicious")
+
+        assert len(result["embedding"]) == 768
+        assert all(isinstance(x, float) for x in result["embedding"])
+
+        assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], (
+            "The model name does not contain 'text' and '004'"
+        )
         assert result["meta"] == {"model": model}