fix: PyPDFToDocument correctly serializes custom converters, deprecate DefaultConverter (#8430)

shadeMe · julian-risch · commit 0bf24c0638e4 · 2024-10-01T16:37:16.000+02:00
* fix: `PyPDFToDocument` correctly serializes custom converters, deprecate `DefaultConverter`

* Remove `auto` prefix from serde util function names, add unit tests
diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -3,14 +3,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+import warnings
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union
 
 from haystack import Document, component, default_from_dict, default_to_dict, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
-from haystack.utils.type_serialization import deserialize_type
+from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
 
 with LazyImport("Run 'pip install pypdf'") as pypdf_import:
     from pypdf import PdfReader
@@ -40,6 +41,11 @@ class DefaultConverter:
     The default converter class that extracts text from a PdfReader object's pages and returns a Document.
     """
 
+    def __init__(self):
+        warnings.warn(
+            "This class is deprecated and will be merged into `PyPDFToDocument` in 2.7.0.", DeprecationWarning
+        )
+
     def convert(self, reader: "PdfReader") -> Document:
         """Extract text from the PDF and return a Document object with the text content."""
         text = "\f".join(page.extract_text() for page in reader.pages)
@@ -86,7 +92,7 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
         """
         pypdf_import.check()
 
-        self.converter = converter or DefaultConverter()
+        self.converter = converter
 
     def to_dict(self):
         """
@@ -95,7 +101,9 @@ def to_dict(self):
         :returns:
             Dictionary with serialized data.
         """
-        return default_to_dict(self, converter=self.converter.to_dict())
+        return default_to_dict(
+            self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
+        )
 
     @classmethod
     def from_dict(cls, data):
@@ -108,10 +116,15 @@ def from_dict(cls, data):
         :returns:
             Deserialized component.
         """
-        converter_class = deserialize_type(data["init_parameters"]["converter"]["type"])
-        data["init_parameters"]["converter"] = converter_class.from_dict(data["init_parameters"]["converter"])
+        custom_converter_data = data["init_parameters"]["converter"]
+        if custom_converter_data is not None:
+            data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
         return default_from_dict(cls, data)
 
+    def _default_convert(self, reader: "PdfReader") -> Document:
+        text = "\f".join(page.extract_text() for page in reader.pages)
+        return Document(content=text)
+
     @component.output_types(documents=List[Document])
     def run(
         self,
@@ -145,7 +158,9 @@ def run(
                 continue
             try:
                 pdf_reader = PdfReader(io.BytesIO(bytestream.data))
-                document = self.converter.convert(pdf_reader)
+                document = (
+                    self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader)
+                )
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
diff --git a/haystack/utils/base_serialization.py b/haystack/utils/base_serialization.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict
+
+from haystack.core.errors import DeserializationError, SerializationError
+from haystack.core.serialization import generate_qualified_class_name, import_class_by_name
+
+
+def serialize_class_instance(obj: Any) -> Dict[str, Any]:
+    """
+    Serializes an object that has a `to_dict` method into a dictionary.
+
+    :param obj:
+        The object to be serialized.
+    :returns:
+        A dictionary representation of the object.
+    :raises SerializationError:
+        If the object does not have a `to_dict` method.
+    """
+    if not hasattr(obj, "to_dict"):
+        raise SerializationError(f"Object of class '{type(obj).__name__}' does not have a 'to_dict' method")
+
+    output = obj.to_dict()
+    return {"type": generate_qualified_class_name(type(obj)), "data": output}
+
+
+def deserialize_class_instance(data: Dict[str, Any]) -> Any:
+    """
+    Deserializes an object from a dictionary representation generated by `auto_serialize_class_instance`.
+
+    :param data:
+        The dictionary to deserialize from.
+    :returns:
+        The deserialized object.
+    :raises DeserializationError:
+        If the serialization data is malformed, the class type cannot be imported, or the
+        class does not have a `from_dict` method.
+    """
+    if "type" not in data:
+        raise DeserializationError("Missing 'type' in serialization data")
+    if "data" not in data:
+        raise DeserializationError("Missing 'data' in serialization data")
+
+    try:
+        obj_class = import_class_by_name(data["type"])
+    except ImportError as e:
+        raise DeserializationError(f"Class '{data['type']}' not correctly imported") from e
+
+    if not hasattr(obj_class, "from_dict"):
+        raise DeserializationError(f"Class '{data['type']}' does not have a 'from_dict' method")
+
+    return obj_class.from_dict(data["data"])
diff --git a/releasenotes/notes/pypdf-serde-fixes-f33c29830508ee01.yaml b/releasenotes/notes/pypdf-serde-fixes-f33c29830508ee01.yaml
@@ -0,0 +1,7 @@
+---
+deprecations:
+  - |
+    The `DefaultConverter` class used by the `PyPDFToDocument` component has been deprecated. Its functionality will be merged into the component in 2.7.0.
+fixes:
+  - |
+    Fix the serialization of `PyPDFToDocument` component to prevent the default converter from being serialized unnecessarily.
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
@@ -7,7 +7,7 @@
 import pytest
 
 from haystack import Document, default_from_dict, default_to_dict
-from haystack.components.converters.pypdf import DefaultConverter, PyPDFToDocument
+from haystack.components.converters.pypdf import PyPDFToDocument
 from haystack.dataclasses import ByteStream
 
 
@@ -16,29 +16,68 @@ def pypdf_converter():
     return PyPDFToDocument()
 
 
+class CustomConverter:
+    def convert(self, reader):
+        return Document(content="Custom converter")
+
+    def to_dict(self):
+        return {"key": "value", "more": False}
+
+    @classmethod
+    def from_dict(cls, data):
+        assert data == {"key": "value", "more": False}
+        return cls()
+
+
 class TestPyPDFToDocument:
     def test_init(self, pypdf_converter):
-        assert isinstance(pypdf_converter.converter, DefaultConverter)
+        assert pypdf_converter.converter is None
+
+    def test_init_params(self):
+        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
+        assert isinstance(pypdf_converter.converter, CustomConverter)
 
     def test_to_dict(self, pypdf_converter):
+        data = pypdf_converter.to_dict()
+        assert data == {
+            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
+            "init_parameters": {"converter": None},
+        }
+
+    def test_to_dict_custom_converter(self):
+        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
         data = pypdf_converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
             "init_parameters": {
-                "converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
+                "converter": {
+                    "data": {"key": "value", "more": False},
+                    "type": "converters.test_pypdf_to_document.CustomConverter",
+                }
             },
         }
 
     def test_from_dict(self):
+        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
+        instance = PyPDFToDocument.from_dict(data)
+        assert isinstance(instance, PyPDFToDocument)
+        assert instance.converter is None
+
+    def test_from_dict_custom_converter(self):
+        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
+        data = pypdf_converter.to_dict()
         data = {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
             "init_parameters": {
-                "converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
+                "converter": {
+                    "data": {"key": "value", "more": False},
+                    "type": "converters.test_pypdf_to_document.CustomConverter",
+                }
             },
         }
         instance = PyPDFToDocument.from_dict(data)
         assert isinstance(instance, PyPDFToDocument)
-        assert isinstance(instance.converter, DefaultConverter)
+        assert isinstance(instance.converter, CustomConverter)
 
     @pytest.mark.integration
     def test_run(self, test_files_path, pypdf_converter):
diff --git a/test/utils/test_base_serialization.py b/test/utils/test_base_serialization.py
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from haystack.core.errors import DeserializationError, SerializationError
+import pytest
+
+from haystack.utils.base_serialization import serialize_class_instance, deserialize_class_instance
+
+
+class CustomClass:
+    def to_dict(self):
+        return {"key": "value", "more": False}
+
+    @classmethod
+    def from_dict(cls, data):
+        assert data == {"key": "value", "more": False}
+        return cls()
+
+
+class CustomClassNoToDict:
+    @classmethod
+    def from_dict(cls, data):
+        assert data == {"key": "value", "more": False}
+        return cls()
+
+
+class CustomClassNoFromDict:
+    def to_dict(self):
+        return {"key": "value", "more": False}
+
+
+def test_serialize_class_instance():
+    result = serialize_class_instance(CustomClass())
+    assert result == {"data": {"key": "value", "more": False}, "type": "test_base_serialization.CustomClass"}
+
+
+def test_serialize_class_instance_missing_method():
+    with pytest.raises(SerializationError, match="does not have a 'to_dict' method"):
+        serialize_class_instance(CustomClassNoToDict())
+
+
+def test_deserialize_class_instance():
+    data = {"data": {"key": "value", "more": False}, "type": "test_base_serialization.CustomClass"}
+
+    result = deserialize_class_instance(data)
+    assert isinstance(result, CustomClass)
+
+
+def test_deserialize_class_instance_invalid_data():
+    data = {"data": {"key": "value", "more": False}, "type": "test_base_serialization.CustomClass"}
+
+    with pytest.raises(DeserializationError, match="Missing 'type'"):
+        deserialize_class_instance({})
+
+    with pytest.raises(DeserializationError, match="Missing 'data'"):
+        deserialize_class_instance({"type": "test_base_serialization.CustomClass"})
+
+    with pytest.raises(
+        DeserializationError, match="Class 'test_base_serialization.CustomClass1' not correctly imported"
+    ):
+        deserialize_class_instance({"type": "test_base_serialization.CustomClass1", "data": {}})
+
+    with pytest.raises(
+        DeserializationError,
+        match="Class 'test_base_serialization.CustomClassNoFromDict' does not have a 'from_dict' method",
+    ):
+        deserialize_class_instance({"type": "test_base_serialization.CustomClassNoFromDict", "data": {}})