Skip to content

Commit b32f85c

Browse files
authored
remove deprecated 'converter' init parameter from PyPDFToDocument component (#8609)
1 parent 3da5bac commit b32f85c

File tree

3 files changed

+7
-128
lines changed

3 files changed

+7
-128
lines changed

haystack/components/converters/pypdf.py

Lines changed: 2 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@
77
import warnings
88
from enum import Enum
99
from pathlib import Path
10-
from typing import Any, Dict, List, Optional, Protocol, Union
10+
from typing import Any, Dict, List, Optional, Union
1111

1212
from haystack import Document, component, default_from_dict, default_to_dict, logging
1313
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1414
from haystack.dataclasses import ByteStream
1515
from haystack.lazy_imports import LazyImport
16-
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
1716

1817
with LazyImport("Run 'pip install pypdf'") as pypdf_import:
1918
from pypdf import PdfReader
@@ -22,25 +21,6 @@
2221
logger = logging.getLogger(__name__)
2322

2423

25-
class PyPDFConverter(Protocol):
26-
"""
27-
A protocol that defines a converter which takes a PdfReader object and converts it into a Document object.
28-
29-
This is deprecated and will be removed in Haystack 2.9.0.
30-
For in-depth customization of the conversion process, consider implementing a custom component.
31-
"""
32-
33-
def convert(self, reader: "PdfReader") -> Document: # noqa: D102
34-
...
35-
36-
def to_dict(self): # noqa: D102
37-
...
38-
39-
@classmethod
40-
def from_dict(cls, data): # noqa: D102
41-
...
42-
43-
4424
class PyPDFExtractionMode(Enum):
4525
"""
4626
The mode to use for extracting text from a PDF.
@@ -91,7 +71,6 @@ class PyPDFToDocument:
9171

9272
def __init__(
9373
self,
94-
converter: Optional[PyPDFConverter] = None,
9574
*,
9675
extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
9776
plain_mode_orientations: tuple = (0, 90, 180, 270),
@@ -105,12 +84,6 @@ def __init__(
10584
"""
10685
Create a PyPDFToDocument component.
10786
108-
:param converter:
109-
An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
110-
For in-depth customization of the conversion process, consider implementing a custom component.
111-
112-
All the following parameters are applied only if `converter` is None.
113-
11487
:param extraction_mode:
11588
The mode to use for extracting text from a PDF.
11689
Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
@@ -139,14 +112,6 @@ def __init__(
139112
"""
140113
pypdf_import.check()
141114

142-
if converter is not None:
143-
msg = (
144-
"The `converter` parameter is deprecated and will be removed in Haystack 2.9.0. "
145-
"For in-depth customization of the conversion process, consider implementing a custom component."
146-
)
147-
warnings.warn(msg, DeprecationWarning)
148-
149-
self.converter = converter
150115
self.store_full_path = store_full_path
151116

152117
if isinstance(extraction_mode, str):
@@ -168,7 +133,6 @@ def to_dict(self):
168133
"""
169134
return default_to_dict(
170135
self,
171-
converter=(serialize_class_instance(self.converter) if self.converter else None),
172136
extraction_mode=str(self.extraction_mode),
173137
plain_mode_orientations=self.plain_mode_orientations,
174138
plain_mode_space_width=self.plain_mode_space_width,
@@ -190,10 +154,6 @@ def from_dict(cls, data):
190154
:returns:
191155
Deserialized component.
192156
"""
193-
init_parameters = data.get("init_parameters", {})
194-
custom_converter_data = init_parameters.get("converter")
195-
if custom_converter_data is not None:
196-
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
197157
return default_from_dict(cls, data)
198158

199159
def _default_convert(self, reader: "PdfReader") -> Document:
@@ -246,9 +206,7 @@ def run(
246206
continue
247207
try:
248208
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
249-
document = (
250-
self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader)
251-
)
209+
document = self._default_convert(pdf_reader)
252210
except Exception as e:
253211
logger.warning(
254212
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
upgrade:
3+
- |
4+
Removed the deprecated `converter` init parameter from `PyPDFToDocument`. To customize the conversion, use the remaining init parameters (for example `extraction_mode`) or implement a custom component.

test/components/converters/test_pypdf_to_document.py

Lines changed: 1 addition & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,8 @@ def pypdf_component():
1616
return PyPDFToDocument()
1717

1818

19-
class CustomConverter:
20-
def convert(self, reader):
21-
return Document(content="Custom converter")
22-
23-
def to_dict(self):
24-
return {"key": "value", "more": False}
25-
26-
@classmethod
27-
def from_dict(cls, data):
28-
assert data == {"key": "value", "more": False}
29-
return cls()
30-
31-
3219
class TestPyPDFToDocument:
3320
def test_init(self, pypdf_component):
34-
assert pypdf_component.converter is None
3521
assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
3622
assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
3723
assert pypdf_component.plain_mode_space_width == 200.0
@@ -40,10 +26,6 @@ def test_init(self, pypdf_component):
4026
assert pypdf_component.layout_mode_strip_rotated is True
4127
assert pypdf_component.layout_mode_font_height_weight == 1.0
4228

43-
def test_init_converter(self):
44-
pypdf_component = PyPDFToDocument(converter=CustomConverter())
45-
assert isinstance(pypdf_component.converter, CustomConverter)
46-
4729
def test_init_custom_params(self):
4830
pypdf_component = PyPDFToDocument(
4931
extraction_mode="layout",
@@ -72,7 +54,6 @@ def test_to_dict(self, pypdf_component):
7254
assert data == {
7355
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
7456
"init_parameters": {
75-
"converter": None,
7657
"extraction_mode": "plain",
7758
"plain_mode_orientations": (0, 90, 180, 270),
7859
"plain_mode_space_width": 200.0,
@@ -84,32 +65,10 @@ def test_to_dict(self, pypdf_component):
8465
},
8566
}
8667

87-
def test_to_dict_custom_converter(self):
88-
pypdf_component = PyPDFToDocument(converter=CustomConverter(), store_full_path=False)
89-
data = pypdf_component.to_dict()
90-
assert data == {
91-
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
92-
"init_parameters": {
93-
"converter": {
94-
"data": {"key": "value", "more": False},
95-
"type": "converters.test_pypdf_to_document.CustomConverter",
96-
},
97-
"extraction_mode": "plain",
98-
"plain_mode_orientations": (0, 90, 180, 270),
99-
"plain_mode_space_width": 200.0,
100-
"layout_mode_space_vertically": True,
101-
"layout_mode_scale_weight": 1.25,
102-
"layout_mode_strip_rotated": True,
103-
"layout_mode_font_height_weight": 1.0,
104-
"store_full_path": False,
105-
},
106-
}
107-
10868
def test_from_dict(self):
10969
data = {
11070
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
11171
"init_parameters": {
112-
"converter": None,
11372
"extraction_mode": "plain",
11473
"plain_mode_orientations": (0, 90, 180, 270),
11574
"plain_mode_space_width": 200.0,
@@ -122,7 +81,6 @@ def test_from_dict(self):
12281

12382
instance = PyPDFToDocument.from_dict(data)
12483
assert isinstance(instance, PyPDFToDocument)
125-
assert instance.converter is None
12684
assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
12785
assert instance.plain_mode_orientations == (0, 90, 180, 270)
12886
assert instance.plain_mode_space_width == 200.0
@@ -135,21 +93,7 @@ def test_from_dict_defaults(self):
13593
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
13694
instance = PyPDFToDocument.from_dict(data)
13795
assert isinstance(instance, PyPDFToDocument)
138-
assert instance.converter is None
139-
140-
def test_from_dict_custom_converter(self):
141-
data = {
142-
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
143-
"init_parameters": {
144-
"converter": {
145-
"data": {"key": "value", "more": False},
146-
"type": "converters.test_pypdf_to_document.CustomConverter",
147-
}
148-
},
149-
}
150-
instance = PyPDFToDocument.from_dict(data)
151-
assert isinstance(instance, PyPDFToDocument)
152-
assert isinstance(instance.converter, CustomConverter)
96+
assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
15397

15498
def test_default_convert(self):
15599
mock_page1 = Mock()
@@ -259,33 +203,6 @@ def test_mixed_sources_run(self, test_files_path, pypdf_component):
259203
assert "History and standardization" in docs[0].content
260204
assert "History and standardization" in docs[1].content
261205

262-
@pytest.mark.integration
263-
def test_custom_converter(self, test_files_path):
264-
"""
265-
Test if the component correctly handles custom converters.
266-
"""
267-
from pypdf import PdfReader
268-
269-
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
270-
271-
class MyCustomConverter:
272-
def convert(self, reader: PdfReader) -> Document:
273-
return Document(content="I don't care about converting given pdfs, I always return this")
274-
275-
def to_dict(self):
276-
return default_to_dict(self)
277-
278-
@classmethod
279-
def from_dict(cls, data):
280-
return default_from_dict(cls, data)
281-
282-
component = PyPDFToDocument(converter=MyCustomConverter())
283-
output = component.run(sources=paths)
284-
docs = output["documents"]
285-
assert len(docs) == 1
286-
assert "ReAct" not in docs[0].content
287-
assert "I don't care about converting given pdfs, I always return this" in docs[0].content
288-
289206
def test_run_empty_document(self, caplog, test_files_path):
290207
paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
291208
with caplog.at_level(logging.WARNING):

0 commit comments

Comments
 (0)