feat: PyPDFToDocument - add new customization parameters (#8574)

anakin87 · web-flow · commit fb42c035c554 · 2024-11-26T16:37:59.000+01:00
* deprecate converter in pypdf

* fix linting of MetaFieldGroupingRanker

* linting

* pypdftodocument: add customization params

* fix mypy

* incorporate feedback
diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -4,6 +4,7 @@
 
 import io
 import warnings
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union
 
@@ -39,6 +40,33 @@ def from_dict(cls, data):  # noqa: D102
         ...
 
 
+class PyPDFExtractionMode(Enum):
+    """
+    The mode to use for extracting text from a PDF.
+    """
+
+    PLAIN = "plain"
+    LAYOUT = "layout"
+
+    def __str__(self) -> str:
+        """
+        Convert a PyPDFExtractionMode enum to a string.
+        """
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "PyPDFExtractionMode":
+        """
+        Convert a string to a PyPDFExtractionMode enum.
+        """
+        enum_map = {e.value: e for e in PyPDFExtractionMode}
+        mode = enum_map.get(string)
+        if mode is None:
+            msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return mode
+
+
 @component
 class PyPDFToDocument:
     """
@@ -60,13 +88,49 @@ class PyPDFToDocument:
     ```
     """
 
-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(
+        self,
+        converter: Optional[PyPDFConverter] = None,
+        *,
+        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
+        plain_mode_orientations: tuple = (0, 90, 180, 270),
+        plain_mode_space_width: float = 200.0,
+        layout_mode_space_vertically: bool = True,
+        layout_mode_scale_weight: float = 1.25,
+        layout_mode_strip_rotated: bool = True,
+        layout_mode_font_height_weight: float = 1.0,
+    ):
         """
         Create an PyPDFToDocument component.
 
         :param converter:
             An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
             For in-depth customization of the conversion process, consider implementing a custom component.
+
+        All the following parameters are applied only if `converter` is None.
+
+        :param extraction_mode:
+            The mode to use for extracting text from a PDF.
+            Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
+        :param plain_mode_orientations:
+            Tuple of orientations to look for when extracting text from a PDF in plain mode.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param plain_mode_space_width:
+            Forces default space width if not extracted from font.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param layout_mode_space_vertically:
+            Whether to include blank lines inferred from y distance + font height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_scale_weight:
+            Multiplier for string length when calculating weighted average character width.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_strip_rotated:
+            Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
+            If rotated text is discovered, layout will be degraded and a warning will be logged.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_font_height_weight:
+            Multiplier for font height when calculating blank line height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
         """
         pypdf_import.check()
 
@@ -79,6 +143,16 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
 
         self.converter = converter
 
+        if isinstance(extraction_mode, str):
+            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
+        self.extraction_mode = extraction_mode
+        self.plain_mode_orientations = plain_mode_orientations
+        self.plain_mode_space_width = plain_mode_space_width
+        self.layout_mode_space_vertically = layout_mode_space_vertically
+        self.layout_mode_scale_weight = layout_mode_scale_weight
+        self.layout_mode_strip_rotated = layout_mode_strip_rotated
+        self.layout_mode_font_height_weight = layout_mode_font_height_weight
+
     def to_dict(self):
         """
         Serializes the component to a dictionary.
@@ -87,7 +161,15 @@ def to_dict(self):
             Dictionary with serialized data.
         """
         return default_to_dict(
-            self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
+            self,
+            converter=(serialize_class_instance(self.converter) if self.converter else None),
+            extraction_mode=str(self.extraction_mode),
+            plain_mode_orientations=self.plain_mode_orientations,
+            plain_mode_space_width=self.plain_mode_space_width,
+            layout_mode_space_vertically=self.layout_mode_space_vertically,
+            layout_mode_scale_weight=self.layout_mode_scale_weight,
+            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
         )
 
     @classmethod
@@ -108,7 +190,20 @@ def from_dict(cls, data):
         return default_from_dict(cls, data)
 
     def _default_convert(self, reader: "PdfReader") -> Document:
-        text = "\f".join(page.extract_text() for page in reader.pages)
+        texts = []
+        for page in reader.pages:
+            texts.append(
+                page.extract_text(
+                    orientations=self.plain_mode_orientations,
+                    extraction_mode=self.extraction_mode.value,
+                    space_width=self.plain_mode_space_width,
+                    layout_mode_space_vertically=self.layout_mode_space_vertically,
+                    layout_mode_scale_weight=self.layout_mode_scale_weight,
+                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+                )
+            )
+        text = "\f".join(texts)
         return Document(content=text)
 
     @component.output_types(documents=List[Document])
diff --git a/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
+    from PDF files.
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
@@ -2,17 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 
 import pytest
 
 from haystack import Document, default_from_dict, default_to_dict
-from haystack.components.converters.pypdf import PyPDFToDocument
+from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
 from haystack.dataclasses import ByteStream
 
 
 @pytest.fixture
-def pypdf_converter():
+def pypdf_component():
     return PyPDFToDocument()
 
 
@@ -30,38 +30,104 @@ def from_dict(cls, data):
 
 
 class TestPyPDFToDocument:
-    def test_init(self, pypdf_converter):
-        assert pypdf_converter.converter is None
+    def test_init(self, pypdf_component):
+        assert pypdf_component.converter is None
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
+        assert pypdf_component.plain_mode_space_width == 200.0
+        assert pypdf_component.layout_mode_space_vertically is True
+        assert pypdf_component.layout_mode_scale_weight == 1.25
+        assert pypdf_component.layout_mode_strip_rotated is True
+        assert pypdf_component.layout_mode_font_height_weight == 1.0
 
-    def test_init_params(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        assert isinstance(pypdf_converter.converter, CustomConverter)
+    def test_init_converter(self):
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        assert isinstance(pypdf_component.converter, CustomConverter)
 
-    def test_to_dict(self, pypdf_converter):
-        data = pypdf_converter.to_dict()
+    def test_init_custom_params(self):
+        pypdf_component = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=0.5,
+        )
+
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
+        assert pypdf_component.plain_mode_orientations == (0, 90)
+        assert pypdf_component.plain_mode_space_width == 150.0
+        assert pypdf_component.layout_mode_space_vertically is False
+        assert pypdf_component.layout_mode_scale_weight == 2.0
+        assert pypdf_component.layout_mode_strip_rotated is False
+        assert pypdf_component.layout_mode_font_height_weight == 0.5
+
+    def test_init_invalid_extraction_mode(self):
+        with pytest.raises(ValueError):
+            PyPDFToDocument(extraction_mode="invalid")
+
+    def test_to_dict(self, pypdf_component):
+        data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
-            "init_parameters": {"converter": None},
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
         }
 
     def test_to_dict_custom_converter(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        data = pypdf_converter.to_dict()
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
             "init_parameters": {
                 "converter": {
                     "data": {"key": "value", "more": False},
                     "type": "converters.test_pypdf_to_document.CustomConverter",
-                }
+                },
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
             },
         }
 
     def test_from_dict(self):
-        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
+        data = {
+            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
+        }
+
         instance = PyPDFToDocument.from_dict(data)
         assert isinstance(instance, PyPDFToDocument)
         assert instance.converter is None
+        assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert instance.plain_mode_orientations == (0, 90, 180, 270)
+        assert instance.plain_mode_space_width == 200.0
+        assert instance.layout_mode_space_vertically is True
+        assert instance.layout_mode_scale_weight == 1.25
+        assert instance.layout_mode_strip_rotated is True
+        assert instance.layout_mode_font_height_weight == 1.0
 
     def test_from_dict_defaults(self):
         data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
@@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self):
         assert isinstance(instance, PyPDFToDocument)
         assert isinstance(instance.converter, CustomConverter)
 
+    def test_default_convert(self):
+        mock_page1 = Mock()
+        mock_page2 = Mock()
+        mock_page1.extract_text.return_value = "Page 1 content"
+        mock_page2.extract_text.return_value = "Page 2 content"
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page1, mock_page2]
+
+        converter = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=1.5,
+        )
+
+        doc = converter._default_convert(mock_reader)
+        assert doc.content == "Page 1 content\fPage 2 content"
+
+        expected_params = {
+            "extraction_mode": "layout",
+            "orientations": (0, 90),
+            "space_width": 150.0,
+            "layout_mode_space_vertically": False,
+            "layout_mode_scale_weight": 2.0,
+            "layout_mode_strip_rotated": False,
+            "layout_mode_font_height_weight": 1.5,
+        }
+        for mock_page in mock_reader.pages:
+            mock_page.extract_text.assert_called_once_with(**expected_params)
+
     @pytest.mark.integration
-    def test_run(self, test_files_path, pypdf_converter):
+    def test_run(self, test_files_path, pypdf_component):
         """
         Test if the component runs correctly.
         """
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1
         assert "History" in docs[0].content
 
     @pytest.mark.integration
-    def test_page_breaks_added(self, test_files_path, pypdf_converter):
+    def test_page_breaks_added(self, test_files_path, pypdf_component):
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1
         assert docs[0].content.count("\f") == 3
 
-    def test_run_with_meta(self, test_files_path, pypdf_converter):
+    def test_run_with_meta(self, test_files_path, pypdf_component):
         bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         with patch("haystack.components.converters.pypdf.PdfReader"):
-            output = pypdf_converter.run(
+            output = pypdf_component.run(
                 sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
             )
 
@@ -115,25 +214,25 @@ def test_run_with_meta(self, test_files_path, pypdf_converter):
         assert output["documents"][0].meta["language"] == "it"
         assert output["documents"][1].meta["language"] == "it"
 
-    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
+    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
         """
         Test if the component correctly handles errors.
         """
         paths = ["non_existing_file.pdf"]
         with caplog.at_level(logging.WARNING):
-            pypdf_converter.run(sources=paths)
+            pypdf_component.run(sources=paths)
             assert "Could not read non_existing_file.pdf" in caplog.text
 
     @pytest.mark.integration
-    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
+    def test_mixed_sources_run(self, test_files_path, pypdf_component):
         """
         Test if the component runs correctly when mixed sources are provided.
         """
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
         with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
             paths.append(ByteStream(f.read()))
 
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 2
         assert "History and standardization" in docs[0].content