Skip to content

Commit

Permalink
feat: Converters - allow passing meta in the run method (#6554)
Browse files Browse the repository at this point in the history
* first impl for html

* progressing on other components

* fix test

* add tests - run with meta

* release note

* reintroduce patches wrongly deleted

* add patch in test

* fix tika test

* Update haystack/components/converters/azure.py

Co-authored-by: Massimiliano Pippi <[email protected]>

---------

Co-authored-by: Massimiliano Pippi <[email protected]>
  • Loading branch information
anakin87 and masci authored Dec 15, 2023
1 parent 4bffe7f commit 0c08943
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 46 deletions.
18 changes: 15 additions & 3 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def __init__(self, endpoint: str, api_key: Optional[str] = None, model_id: str =
self.endpoint = endpoint
self.model_id = model_id

@component.output_types(documents=List[Document], azure=List[Dict])
def run(self, sources: List[Union[str, Path, ByteStream]]):
@component.output_types(documents=List[Document], raw_azure_response=List[Dict])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Convert files to Documents using Azure's Document Intelligence service.
Expand All @@ -66,10 +66,20 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
the raw responses from Azure's Document Intelligence service.
:param sources: List of file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key
and the raw Azure response under the 'raw_azure_response' key.
"""
documents = []
azure_output = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:
Expand All @@ -87,6 +97,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
file_suffix = Path(bytestream.metadata["file_path"]).suffix

document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix)
merged_metadata = {**bytestream.metadata, **metadata}
document.meta = merged_metadata
documents.append(document)

return {"documents": documents, "raw_azure_response": azure_output}
Expand Down
20 changes: 7 additions & 13 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,22 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
:param sources: List of HTML file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: List of converted Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""

documents = []

# Create metadata placeholders if not provided
if meta:
if len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
else:
if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

extractor = extractors.ArticleExtractor(raise_on_failure=False)

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source=source)
extracted_meta = bytestream.metadata
except Exception as e:
logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
continue
Expand All @@ -64,11 +61,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue

# Merge metadata received from ByteStream with supplied metadata
if extracted_meta:
# Supplied metadata overwrites metadata from ByteStream for overlapping keys.
metadata = {**extracted_meta, **metadata}
document = Document(content=text, meta=metadata)
merged_metadata = {**bytestream.metadata, **metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
9 changes: 7 additions & 2 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,19 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
:param sources: A list of markdown data sources (file paths or binary objects)
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
parser = MarkdownIt(renderer_cls=RendererPlain)
if self.table_to_single_line:
parser.enable("table")

documents = []

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in tqdm(
zip(sources, meta),
Expand All @@ -79,7 +83,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue

document = Document(content=text, meta=metadata)
merged_metadata = {**bytestream.metadata, **metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
17 changes: 14 additions & 3 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import logging
from typing import List, Union, Protocol, Dict
from typing import List, Union, Protocol, Dict, Any, Optional
from pathlib import Path

from haystack.dataclasses import ByteStream
Expand Down Expand Up @@ -71,15 +71,23 @@ def to_dict(self):
return default_to_dict(self, converter_name=self.converter_name)

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Converts a list of PDF sources into Document objects using the configured converter.
:param sources: A list of PDF data sources, which can be file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
documents = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand All @@ -91,6 +99,9 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
except Exception as e:
logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
continue

merged_metadata = {**bytestream.metadata, **metadata}
document.meta = merged_metadata
documents.append(document)

return {"documents": documents}
19 changes: 15 additions & 4 deletions haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import List, Union
from typing import List, Union, Dict, Any, Optional
import io

from haystack.lazy_imports import LazyImport
Expand Down Expand Up @@ -37,15 +37,24 @@ def __init__(self, tika_url: str = "http://localhost:9998/tika"):
self.tika_url = tika_url

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Convert files to Documents.
:param sources: List of file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""

documents = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand All @@ -56,6 +65,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
except Exception as conversion_e:
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue
document = Document(content=text)

merged_metadata = {**bytestream.metadata, **metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)
return {"documents": documents}
31 changes: 21 additions & 10 deletions haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import List, Union
from typing import List, Union, Dict, Any, Optional

from haystack import Document, component
from haystack.dataclasses import ByteStream
Expand All @@ -27,28 +27,39 @@ def __init__(self, encoding: str = "utf-8"):
self.encoding = encoding

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Convert text files to Documents.
:param streams: A list of paths to text files or ByteStream objects.
Note that if an encoding is specified in the metadata of a ByteStream,
it will override the component's default.
:return: A dictionary containing the converted documents.
:param sources: A list of paths to text files or ByteStream objects.
Note that if an encoding is specified in the metadata of a ByteStream,
it will override the component's default.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
documents = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
continue
try:
encoding = bytestream.metadata.get("encoding", self.encoding)
document = Document(content=bytestream.data.decode(encoding))
document.meta = bytestream.metadata
documents.append(document)
text = bytestream.data.decode(encoding)
except Exception as e:
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
continue

merged_metadata = {**bytestream.metadata, **metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
enhancements:
- |
Make all Converters accept `meta` in the `run` method, so that users can
provide their own metadata.
`meta` is an optional list of dictionaries; its length must match the number of `sources`.
13 changes: 13 additions & 0 deletions test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.dataclasses import ByteStream


class TestAzureOCRDocumentConverter:
Expand Down Expand Up @@ -43,6 +44,18 @@ def test_run(self, test_files_path):
"pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
}

def test_run_with_meta(self):
bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})

with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")

output = component.run(sources=[bytestream], meta=[{"language": "it"}])
document = output["documents"][0]

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}

@pytest.mark.integration
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure credentials not available")
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
Expand Down
21 changes: 10 additions & 11 deletions test/components/converters/test_markdown_to_document.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

from unittest.mock import patch
import pytest

from haystack.components.converters.markdown import MarkdownToDocument
Expand Down Expand Up @@ -30,19 +31,17 @@ def test_run(self, test_files_path):
assert "What to build with Haystack" in doc.content
assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content

@pytest.mark.integration
def test_run_metadata(self, test_files_path):
def test_run_with_meta(self):
bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})

converter = MarkdownToDocument()
sources = [test_files_path / "markdown" / "sample.md"]
metadata = [{"file_name": "sample.md"}]
results = converter.run(sources=sources, meta=metadata)
docs = results["documents"]

assert len(docs) == 1
for doc in docs:
assert "What to build with Haystack" in doc.content
assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content
assert doc.meta == {"file_name": "sample.md"}
with patch("haystack.components.converters.markdown.MarkdownIt"):
output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
document = output["documents"][0]

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}

@pytest.mark.integration
def test_run_wrong_file_type(self, test_files_path, caplog):
Expand Down
13 changes: 13 additions & 0 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from unittest.mock import patch
import pytest

from haystack import Document
Expand Down Expand Up @@ -28,6 +29,18 @@ def test_run(self, test_files_path):
assert len(docs) == 1
assert "ReAct" in docs[0].content

def test_run_with_meta(self):
bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})

converter = PyPDFToDocument()
with patch("haystack.components.converters.pypdf.PdfReader"):
output = converter.run(sources=[bytestream], meta=[{"language": "it"}])

document = output["documents"][0]

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}

def test_run_error_handling(self, test_files_path, caplog):
"""
Test if the component correctly handles errors.
Expand Down
11 changes: 11 additions & 0 deletions test/components/converters/test_textfile_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,14 @@ def test_encoding_override(self, test_files_path):
bytestream.metadata["encoding"] = "utf-8"
output = converter.run(sources=[bytestream])
assert "Some text for testing." in output["documents"][0].content

def test_run_with_meta(self):
bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})

converter = TextFileToDocument()

output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
document = output["documents"][0]

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}
11 changes: 11 additions & 0 deletions test/components/converters/test_tika_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@ def test_run(self, mock_get_bytestream_from_source, mock_tika_parser):
assert len(documents) == 1
assert documents[0].content == "Content of mock_file.pdf"

def test_run_with_meta(self):
bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})

converter = TikaDocumentConverter()
with patch("haystack.components.converters.tika.tika_parser.from_buffer"):
output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
document = output["documents"][0]

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}

def test_run_nonexistent_file(self, caplog):
component = TikaDocumentConverter()
with caplog.at_level("WARNING"):
Expand Down

0 comments on commit 0c08943

Please sign in to comment.