Skip to content

Commit 0bf24c0

Browse files
shadeMe and julian-risch
authored and committed
fix: PyPDFToDocument correctly serializes custom converters, deprecate DefaultConverter (#8430)
* fix: `PyPDFToDocument` correctly serializes custom converters, deprecate `DefaultConverter` * Remove `auto` prefix from serde util function names, add unit tests
1 parent 3ae34b5 commit 0bf24c0

File tree

5 files changed

+193
-11
lines changed

5 files changed

+193
-11
lines changed

Diff for: haystack/components/converters/pypdf.py

+21-6
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import io
6+
import warnings
67
from pathlib import Path
78
from typing import Any, Dict, List, Optional, Protocol, Union
89

910
from haystack import Document, component, default_from_dict, default_to_dict, logging
1011
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1112
from haystack.dataclasses import ByteStream
1213
from haystack.lazy_imports import LazyImport
13-
from haystack.utils.type_serialization import deserialize_type
14+
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
1415

1516
with LazyImport("Run 'pip install pypdf'") as pypdf_import:
1617
from pypdf import PdfReader
@@ -40,6 +41,11 @@ class DefaultConverter:
4041
The default converter class that extracts text from a PdfReader object's pages and returns a Document.
4142
"""
4243

44+
def __init__(self):
    """
    Create the converter and emit a deprecation notice.

    Every instantiation raises a `DeprecationWarning` announcing that this class
    will be merged into `PyPDFToDocument`.
    """
    warnings.warn(
        "This class is deprecated and will be merged into `PyPDFToDocument` in 2.7.0.",
        DeprecationWarning,
        # Attribute the warning to the code that instantiates the class rather than to this module,
        # so that users see where the deprecated usage lives.
        stacklevel=2,
    )
48+
4349
def convert(self, reader: "PdfReader") -> Document:
4450
"""Extract text from the PDF and return a Document object with the text content."""
4551
text = "\f".join(page.extract_text() for page in reader.pages)
@@ -86,7 +92,7 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
8692
"""
8793
pypdf_import.check()
8894

89-
self.converter = converter or DefaultConverter()
95+
self.converter = converter
9096

9197
def to_dict(self):
9298
"""
@@ -95,7 +101,9 @@ def to_dict(self):
95101
:returns:
96102
Dictionary with serialized data.
97103
"""
98-
return default_to_dict(self, converter=self.converter.to_dict())
104+
return default_to_dict(
105+
self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
106+
)
99107

100108
@classmethod
101109
def from_dict(cls, data):
@@ -108,10 +116,15 @@ def from_dict(cls, data):
108116
:returns:
109117
Deserialized component.
110118
"""
111-
converter_class = deserialize_type(data["init_parameters"]["converter"]["type"])
112-
data["init_parameters"]["converter"] = converter_class.from_dict(data["init_parameters"]["converter"])
119+
custom_converter_data = data["init_parameters"]["converter"]
120+
if custom_converter_data is not None:
121+
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
113122
return default_from_dict(cls, data)
114123

124+
def _default_convert(self, reader: "PdfReader") -> Document:
    """Extract the text of every page and return it as a single Document, pages joined by form feeds."""
    page_texts = [page.extract_text() for page in reader.pages]
    return Document(content="\f".join(page_texts))
127+
115128
@component.output_types(documents=List[Document])
116129
def run(
117130
self,
@@ -145,7 +158,9 @@ def run(
145158
continue
146159
try:
147160
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
148-
document = self.converter.convert(pdf_reader)
161+
document = (
162+
self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader)
163+
)
149164
except Exception as e:
150165
logger.warning(
151166
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e

Diff for: haystack/utils/base_serialization.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Any, Dict
6+
7+
from haystack.core.errors import DeserializationError, SerializationError
8+
from haystack.core.serialization import generate_qualified_class_name, import_class_by_name
9+
10+
11+
def serialize_class_instance(obj: Any) -> Dict[str, Any]:
    """
    Turns an object exposing a `to_dict` method into a dictionary.

    The result holds the qualified class name under `type` and the output of
    the object's `to_dict` method under `data`.

    :param obj:
        The object to serialize.
    :returns:
        The dictionary representation of the object.
    :raises SerializationError:
        If the object has no `to_dict` method.
    """
    obj_type = type(obj)
    if not hasattr(obj, "to_dict"):
        raise SerializationError(f"Object of class '{obj_type.__name__}' does not have a 'to_dict' method")

    # Serialize the payload first, then resolve the class name.
    payload = obj.to_dict()
    return {"type": generate_qualified_class_name(obj_type), "data": payload}
27+
28+
29+
def deserialize_class_instance(data: Dict[str, Any]) -> Any:
    """
    Deserializes an object from a dictionary representation generated by `serialize_class_instance`.

    :param data:
        The dictionary to deserialize from. It must contain the class name under `type`
        and the payload passed to the class's `from_dict` method under `data`.
    :returns:
        The deserialized object.
    :raises DeserializationError:
        If the serialization data is malformed, the class type cannot be imported, or the
        class does not have a `from_dict` method.
    """
    # Both keys are mandatory; report the first one that is missing.
    if "type" not in data:
        raise DeserializationError("Missing 'type' in serialization data")
    if "data" not in data:
        raise DeserializationError("Missing 'data' in serialization data")

    try:
        obj_class = import_class_by_name(data["type"])
    except ImportError as e:
        # Surface import problems as a deserialization failure while keeping the original cause.
        raise DeserializationError(f"Class '{data['type']}' not correctly imported") from e

    if not hasattr(obj_class, "from_dict"):
        raise DeserializationError(f"Class '{data['type']}' does not have a 'from_dict' method")

    return obj_class.from_dict(data["data"])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
deprecations:
3+
- |
4+
The `DefaultConverter` class used by the `PyPDFToDocument` component has been deprecated. Its functionality will be merged into the component in 2.7.0.
5+
fixes:
6+
- |
7+
Fix the serialization of `PyPDFToDocument` component to prevent the default converter from being serialized unnecessarily.

Diff for: test/components/converters/test_pypdf_to_document.py

+44-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytest
88

99
from haystack import Document, default_from_dict, default_to_dict
10-
from haystack.components.converters.pypdf import DefaultConverter, PyPDFToDocument
10+
from haystack.components.converters.pypdf import PyPDFToDocument
1111
from haystack.dataclasses import ByteStream
1212

1313

@@ -16,29 +16,68 @@ def pypdf_converter():
1616
return PyPDFToDocument()
1717

1818

19+
class CustomConverter:
    """Minimal converter stub with `to_dict`/`from_dict` support, used to exercise custom-converter paths."""

    def convert(self, reader):
        """Ignore the reader and return a fixed Document."""
        return Document(content="Custom converter")

    def to_dict(self):
        """Return a fixed serialization payload."""
        return dict(key="value", more=False)

    @classmethod
    def from_dict(cls, data):
        """Check that the payload matches the one produced by `to_dict` and build a new instance."""
        expected = {"key": "value", "more": False}
        assert data == expected
        return cls()
30+
31+
1932
class TestPyPDFToDocument:
2033
def test_init(self, pypdf_converter):
21-
assert isinstance(pypdf_converter.converter, DefaultConverter)
34+
assert pypdf_converter.converter is None
35+
36+
def test_init_params(self):
37+
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
38+
assert isinstance(pypdf_converter.converter, CustomConverter)
2239

2340
def test_to_dict(self, pypdf_converter):
41+
data = pypdf_converter.to_dict()
42+
assert data == {
43+
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
44+
"init_parameters": {"converter": None},
45+
}
46+
47+
def test_to_dict_custom_converter(self):
48+
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
2449
data = pypdf_converter.to_dict()
2550
assert data == {
2651
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
2752
"init_parameters": {
28-
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
53+
"converter": {
54+
"data": {"key": "value", "more": False},
55+
"type": "converters.test_pypdf_to_document.CustomConverter",
56+
}
2957
},
3058
}
3159

3260
def test_from_dict(self):
61+
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
62+
instance = PyPDFToDocument.from_dict(data)
63+
assert isinstance(instance, PyPDFToDocument)
64+
assert instance.converter is None
65+
66+
def test_from_dict_custom_converter(self):
67+
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
68+
data = pypdf_converter.to_dict()
3369
data = {
3470
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
3571
"init_parameters": {
36-
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
72+
"converter": {
73+
"data": {"key": "value", "more": False},
74+
"type": "converters.test_pypdf_to_document.CustomConverter",
75+
}
3776
},
3877
}
3978
instance = PyPDFToDocument.from_dict(data)
4079
assert isinstance(instance, PyPDFToDocument)
41-
assert isinstance(instance.converter, DefaultConverter)
80+
assert isinstance(instance.converter, CustomConverter)
4281

4382
@pytest.mark.integration
4483
def test_run(self, test_files_path, pypdf_converter):

Diff for: test/utils/test_base_serialization.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
from haystack.core.errors import DeserializationError, SerializationError
5+
import pytest
6+
7+
from haystack.utils.base_serialization import serialize_class_instance, deserialize_class_instance
8+
9+
10+
class CustomClass:
    """Test helper that supports both `to_dict` and `from_dict`."""

    def to_dict(self):
        """Return a fixed serialization payload."""
        return dict(key="value", more=False)

    @classmethod
    def from_dict(cls, data):
        """Check that the payload matches the one produced by `to_dict` and build a new instance."""
        expected = {"key": "value", "more": False}
        assert data == expected
        return cls()
18+
19+
20+
class CustomClassNoToDict:
    """Test helper that can be deserialized but deliberately lacks a `to_dict` method."""

    @classmethod
    def from_dict(cls, data):
        """Check that the payload has the expected content and build a new instance."""
        expected = {"key": "value", "more": False}
        assert data == expected
        return cls()
25+
26+
27+
class CustomClassNoFromDict:
    """Test helper that can be serialized but deliberately lacks a `from_dict` method."""

    def to_dict(self):
        """Return a fixed serialization payload."""
        return dict(key="value", more=False)
30+
31+
32+
def test_serialize_class_instance():
    """Serializing an object with `to_dict` wraps its payload together with the qualified class name."""
    expected = {"type": "test_base_serialization.CustomClass", "data": {"key": "value", "more": False}}
    assert serialize_class_instance(CustomClass()) == expected
35+
36+
37+
def test_serialize_class_instance_missing_method():
    """An object without a `to_dict` method is rejected with a `SerializationError`."""
    instance = CustomClassNoToDict()
    with pytest.raises(SerializationError, match="does not have a 'to_dict' method"):
        serialize_class_instance(instance)
40+
41+
42+
def test_deserialize_class_instance():
    """A well-formed payload is turned back into an instance of the named class."""
    payload = {"type": "test_base_serialization.CustomClass", "data": {"key": "value", "more": False}}
    assert isinstance(deserialize_class_instance(payload), CustomClass)
47+
48+
49+
def test_deserialize_class_instance_invalid_data():
    """Each kind of malformed input is reported through a `DeserializationError` with a specific message."""
    # Neither key is present: the missing `type` is reported first.
    with pytest.raises(DeserializationError, match="Missing 'type'"):
        deserialize_class_instance({})

    # `type` is present but the payload is missing.
    with pytest.raises(DeserializationError, match="Missing 'data'"):
        deserialize_class_instance({"type": "test_base_serialization.CustomClass"})

    # The named class does not exist in the module.
    with pytest.raises(
        DeserializationError, match="Class 'test_base_serialization.CustomClass1' not correctly imported"
    ):
        deserialize_class_instance({"type": "test_base_serialization.CustomClass1", "data": {}})

    # The class exists but cannot be rebuilt because it has no `from_dict`.
    with pytest.raises(
        DeserializationError,
        match="Class 'test_base_serialization.CustomClassNoFromDict' does not have a 'from_dict' method",
    ):
        deserialize_class_instance({"type": "test_base_serialization.CustomClassNoFromDict", "data": {}})

0 commit comments

Comments
 (0)