feat: Converters - allow passing meta in the run method (#6554)

* first impl for html
* progressing on other components
* fix test
* add tests - run with meta
* release note
* reintroduce patches wrongly deleted
* add patch in test
* fix tika test
* Update haystack/components/converters/azure.py

Co-authored-by: Massimiliano Pippi <[email protected]>

---------

Co-authored-by: Massimiliano Pippi <[email protected]>
deepset-ai · Dec 15, 2023 · 0c08943 · 0c08943
1 parent 4bffe7f
commit 0c08943
Show file tree

Hide file tree

Showing 12 changed files with 143 additions and 46 deletions.
diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py
@@ -56,8 +56,8 @@ def __init__(self, endpoint: str, api_key: Optional[str] = None, model_id: str =
         self.endpoint = endpoint
         self.model_id = model_id
 
-    @component.output_types(documents=List[Document], azure=List[Dict])
-    def run(self, sources: List[Union[str, Path, ByteStream]]):
+    @component.output_types(documents=List[Document], raw_azure_response=List[Dict])
+    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
         Convert files to Documents using Azure's Document Intelligence service.
 
@@ -66,10 +66,20 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
         the raw responses from Azure's Document Intelligence service.
 
         :param sources: List of file paths or ByteStream objects.
+        :param meta: Optional list of metadata to attach to the Documents.
+          The length of the list must match the number of sources. Defaults to `None`.
+        :return: A dictionary containing a list of Document objects under the 'documents' key
+          and the raw Azure response under the 'raw_azure_response' key.
         """
         documents = []
         azure_output = []
-        for source in sources:
+
+        if meta is None:
+            meta = [{}] * len(sources)
+        elif len(sources) != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
+
+        for source, metadata in zip(sources, meta):
             try:
                 bytestream = get_bytestream_from_source(source=source)
             except Exception as e:
@@ -87,6 +97,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
                 file_suffix = Path(bytestream.metadata["file_path"]).suffix
 
             document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix)
+            merged_metadata = {**bytestream.metadata, **metadata}
+            document.meta = merged_metadata
             documents.append(document)
 
         return {"documents": documents, "raw_azure_response": azure_output}

diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
@@ -35,25 +35,22 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
 
         :param sources: List of HTML file paths or ByteStream objects.
         :param meta: Optional list of metadata to attach to the Documents.
-        The length of the list must match the number of sources. Defaults to `None`.
-        :return: List of converted Documents.
+          The length of the list must match the number of sources. Defaults to `None`.
+        :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
 
         documents = []
 
-        # Create metadata placeholders if not provided
-        if meta:
-            if len(sources) != len(meta):
-                raise ValueError("The length of the metadata list must match the number of sources.")
-        else:
+        if meta is None:
             meta = [{}] * len(sources)
+        elif len(sources) != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
 
         extractor = extractors.ArticleExtractor(raise_on_failure=False)
 
         for source, metadata in zip(sources, meta):
             try:
                 bytestream = get_bytestream_from_source(source=source)
-                extracted_meta = bytestream.metadata
             except Exception as e:
                 logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                 continue
@@ -64,11 +61,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
 
-            # Merge metadata received from ByteStream with supplied metadata
-            if extracted_meta:
-                # Supplied metadata overwrites metadata from ByteStream for overlapping keys.
-                metadata = {**extracted_meta, **metadata}
-            document = Document(content=text, meta=metadata)
+            merged_metadata = {**bytestream.metadata, **metadata}
+            document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
         return {"documents": documents}
diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py
@@ -51,15 +51,19 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
 
         :param sources: A list of markdown data sources (file paths or binary objects)
         :param meta: Optional list of metadata to attach to the Documents.
-        The length of the list must match the number of paths. Defaults to `None`.
+          The length of the list must match the number of sources. Defaults to `None`.
+        :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
         parser = MarkdownIt(renderer_cls=RendererPlain)
         if self.table_to_single_line:
             parser.enable("table")
 
         documents = []
+
         if meta is None:
             meta = [{}] * len(sources)
+        elif len(sources) != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
 
         for source, metadata in tqdm(
             zip(sources, meta),
@@ -79,7 +83,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
 
-            document = Document(content=text, meta=metadata)
+            merged_metadata = {**bytestream.metadata, **metadata}
+            document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
         return {"documents": documents}
diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -1,6 +1,6 @@
 import io
 import logging
-from typing import List, Union, Protocol, Dict
+from typing import List, Union, Protocol, Dict, Any, Optional
 from pathlib import Path
 
 from haystack.dataclasses import ByteStream
@@ -71,15 +71,23 @@ def to_dict(self):
         return default_to_dict(self, converter_name=self.converter_name)
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]]):
+    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
         Converts a list of PDF sources into Document objects using the configured converter.
 
         :param sources: A list of PDF data sources, which can be file paths or ByteStream objects.
+        :param meta: Optional list of metadata to attach to the Documents.
+          The length of the list must match the number of sources. Defaults to `None`.
         :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
         documents = []
-        for source in sources:
+
+        if meta is None:
+            meta = [{}] * len(sources)
+        elif len(sources) != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
+
+        for source, metadata in zip(sources, meta):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:
@@ -91,6 +99,9 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
             except Exception as e:
                 logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
                 continue
+
+            merged_metadata = {**bytestream.metadata, **metadata}
+            document.meta = merged_metadata
             documents.append(document)
 
         return {"documents": documents}
diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Dict, Any, Optional
 import io
 
 from haystack.lazy_imports import LazyImport
@@ -37,15 +37,24 @@ def __init__(self, tika_url: str = "http://localhost:9998/tika"):
         self.tika_url = tika_url
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]]):
+    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
         Convert files to Documents.
 
         :param sources: List of file paths or ByteStream objects.
+        :param meta: Optional list of metadata to attach to the Documents.
+          The length of the list must match the number of sources. Defaults to `None`.
+        :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
 
         documents = []
-        for source in sources:
+
+        if meta is None:
+            meta = [{}] * len(sources)
+        elif len(sources) != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
+
+        for source, metadata in zip(sources, meta):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:
@@ -56,6 +65,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
             except Exception as conversion_e:
                 logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                 continue
-            document = Document(content=text)
+
+            merged_metadata = {**bytestream.metadata, **metadata}
+            document = Document(content=text, meta=merged_metadata)
             documents.append(document)
         return {"documents": documents}
diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Dict, Any, Optional
 
 from haystack import Document, component
 from haystack.dataclasses import ByteStream
@@ -27,28 +27,39 @@ def __init__(self, encoding: str = "utf-8"):
         self.encoding = encoding
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]]):
+    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
         Convert text files to Documents.
 
-        :param streams: A list of paths to text files or ByteStream objects.
-            Note that if an encoding is specified in the metadata of a ByteStream,
-            it will override the component's default.
-        :return: A dictionary containing the converted documents.
+        :param sources: A list of paths to text files or ByteStream objects.
+          Note that if an encoding is specified in the metadata of a ByteStream,
+          it will override the component's default.
+        :param meta: Optional list of metadata to attach to the Documents.
+          The length of the list must match the number of sources. Defaults to `None`.
+        :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
         documents = []
-        for source in sources:
+
+        if meta is None:
+            meta = [{}] * len(sources)
+        elif len(sources) != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
+
+        for source, metadata in zip(sources, meta):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:
                 logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                 continue
             try:
                 encoding = bytestream.metadata.get("encoding", self.encoding)
-                document = Document(content=bytestream.data.decode(encoding))
-                document.meta = bytestream.metadata
-                documents.append(document)
+                text = bytestream.data.decode(encoding)
             except Exception as e:
                 logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
+                continue
+
+            merged_metadata = {**bytestream.metadata, **metadata}
+            document = Document(content=text, meta=merged_metadata)
+            documents.append(document)
 
         return {"documents": documents}
diff --git a/releasenotes/notes/converters-allow-passing-meta-70fb498b6eb80468.yaml b/releasenotes/notes/converters-allow-passing-meta-70fb498b6eb80468.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Make all Converters accept an optional `meta` list in the `run` method, so that users can
+    provide their own metadata, which is merged with the metadata of each source.
+    The length of the `meta` list must match the number of `sources`.
diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py
@@ -4,6 +4,7 @@
 import pytest
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
+from haystack.dataclasses import ByteStream
 
 
 class TestAzureOCRDocumentConverter:
@@ -43,6 +44,18 @@ def test_run(self, test_files_path):
                 "pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
             }
 
+    def test_run_with_meta(self):
+        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+
+        with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
+            component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
+
+        output = component.run(sources=[bytestream], meta=[{"language": "it"}])
+        document = output["documents"][0]
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert document.meta == {"author": "test_author", "language": "it"}
+
     @pytest.mark.integration
     @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure credentials not available")
     @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")

diff --git a/test/components/converters/test_markdown_to_document.py b/test/components/converters/test_markdown_to_document.py
@@ -1,5 +1,6 @@
 import logging
 
+from unittest.mock import patch
 import pytest
 
 from haystack.components.converters.markdown import MarkdownToDocument
@@ -30,19 +31,17 @@ def test_run(self, test_files_path):
             assert "What to build with Haystack" in doc.content
             assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content
 
-    @pytest.mark.integration
-    def test_run_metadata(self, test_files_path):
+    def test_run_with_meta(self):
+        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+
         converter = MarkdownToDocument()
-        sources = [test_files_path / "markdown" / "sample.md"]
-        metadata = [{"file_name": "sample.md"}]
-        results = converter.run(sources=sources, meta=metadata)
-        docs = results["documents"]
 
-        assert len(docs) == 1
-        for doc in docs:
-            assert "What to build with Haystack" in doc.content
-            assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content
-            assert doc.meta == {"file_name": "sample.md"}
+        with patch("haystack.components.converters.markdown.MarkdownIt"):
+            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
+        document = output["documents"][0]
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert document.meta == {"author": "test_author", "language": "it"}
 
     @pytest.mark.integration
     def test_run_wrong_file_type(self, test_files_path, caplog):

diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
@@ -1,4 +1,5 @@
 import logging
+from unittest.mock import patch
 import pytest
 
 from haystack import Document
@@ -28,6 +29,18 @@ def test_run(self, test_files_path):
         assert len(docs) == 1
         assert "ReAct" in docs[0].content
 
+    def test_run_with_meta(self):
+        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+
+        converter = PyPDFToDocument()
+        with patch("haystack.components.converters.pypdf.PdfReader"):
+            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
+
+        document = output["documents"][0]
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert document.meta == {"author": "test_author", "language": "it"}
+
     def test_run_error_handling(self, test_files_path, caplog):
         """
         Test if the component correctly handles errors.

diff --git a/test/components/converters/test_textfile_to_document.py b/test/components/converters/test_textfile_to_document.py
@@ -56,3 +56,14 @@ def test_encoding_override(self, test_files_path):
         bytestream.metadata["encoding"] = "utf-8"
         output = converter.run(sources=[bytestream])
         assert "Some text for testing." in output["documents"][0].content
+
+    def test_run_with_meta(self):
+        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+
+        converter = TextFileToDocument()
+
+        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
+        document = output["documents"][0]
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert document.meta == {"author": "test_author", "language": "it"}
diff --git a/test/components/converters/test_tika_doc_converter.py b/test/components/converters/test_tika_doc_converter.py
@@ -20,6 +20,17 @@ def test_run(self, mock_get_bytestream_from_source, mock_tika_parser):
         assert len(documents) == 1
         assert documents[0].content == "Content of mock_file.pdf"
 
+    def test_run_with_meta(self):
+        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
+
+        converter = TikaDocumentConverter()
+        with patch("haystack.components.converters.tika.tika_parser.from_buffer"):
+            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
+        document = output["documents"][0]
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert document.meta == {"author": "test_author", "language": "it"}
+
     def test_run_nonexistent_file(self, caplog):
         component = TikaDocumentConverter()
         with caplog.at_level("WARNING"):