Skip to content

Commit fb42c03

Browse files
authored
feat: PyPDFToDocument - add new customization parameters (#8574)
* deprecate converter in pypdf * fix linting of MetaFieldGroupingRanker * linting * pypdftodocument: add customization params * fix mypy * incorporate feedback
1 parent 2440a5e commit fb42c03

File tree

3 files changed

+227
-28
lines changed

3 files changed

+227
-28
lines changed

Diff for: haystack/components/converters/pypdf.py

+98-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import io
66
import warnings
7+
from enum import Enum
78
from pathlib import Path
89
from typing import Any, Dict, List, Optional, Protocol, Union
910

@@ -39,6 +40,33 @@ def from_dict(cls, data): # noqa: D102
3940
...
4041

4142

43+
class PyPDFExtractionMode(Enum):
    """
    The mode to use for extracting text from a PDF.

    Each member's value is the plain string form of the mode; `str(member)` returns it
    and `from_str` converts it back to the member.
    """

    PLAIN = "plain"
    LAYOUT = "layout"

    def __str__(self) -> str:
        """
        Convert a PyPDFExtractionMode enum to a string.

        :returns:
            The value of the member, for example `"plain"`.
        """
        return self.value

    @staticmethod
    def from_str(string: str) -> "PyPDFExtractionMode":
        """
        Convert a string to a PyPDFExtractionMode enum.

        :param string:
            The value of one of the enum members (`"plain"` or `"layout"`).
            The comparison is exact and case-sensitive.
        :returns:
            The enum member whose value equals `string`.
        :raises ValueError:
            If `string` is not the value of any member.
        """
        try:
            # Enum's built-in lookup by value replaces a hand-built value -> member dict.
            return PyPDFExtractionMode(string)
        except ValueError:
            supported = [mode.value for mode in PyPDFExtractionMode]
            msg = f"Unknown extraction mode '{string}'. Supported modes are: {supported}"
            # `from None` hides the generic Enum lookup error so only this message is reported.
            raise ValueError(msg) from None
68+
69+
4270
@component
4371
class PyPDFToDocument:
4472
"""
@@ -60,13 +88,49 @@ class PyPDFToDocument:
6088
```
6189
"""
6290

63-
def __init__(self, converter: Optional[PyPDFConverter] = None):
91+
def __init__(
92+
self,
93+
converter: Optional[PyPDFConverter] = None,
94+
*,
95+
extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
96+
plain_mode_orientations: tuple = (0, 90, 180, 270),
97+
plain_mode_space_width: float = 200.0,
98+
layout_mode_space_vertically: bool = True,
99+
layout_mode_scale_weight: float = 1.25,
100+
layout_mode_strip_rotated: bool = True,
101+
layout_mode_font_height_weight: float = 1.0,
102+
):
64103
"""
65104
Create a PyPDFToDocument component.
66105
67106
:param converter:
68107
An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
69108
For in-depth customization of the conversion process, consider implementing a custom component.
109+
110+
All the following parameters are applied only if `converter` is None.
111+
112+
:param extraction_mode:
113+
The mode to use for extracting text from a PDF.
114+
Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
115+
:param plain_mode_orientations:
116+
Tuple of orientations to look for when extracting text from a PDF in plain mode.
117+
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
118+
:param plain_mode_space_width:
119+
Forces default space width if not extracted from font.
120+
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
121+
:param layout_mode_space_vertically:
122+
Whether to include blank lines inferred from y distance + font height.
123+
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
124+
:param layout_mode_scale_weight:
125+
Multiplier for string length when calculating weighted average character width.
126+
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
127+
:param layout_mode_strip_rotated:
128+
Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
129+
If rotated text is discovered, layout will be degraded and a warning will be logged.
130+
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
131+
:param layout_mode_font_height_weight:
132+
Multiplier for font height when calculating blank line height.
133+
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
70134
"""
71135
pypdf_import.check()
72136

@@ -79,6 +143,16 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
79143

80144
self.converter = converter
81145

146+
if isinstance(extraction_mode, str):
147+
extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
148+
self.extraction_mode = extraction_mode
149+
self.plain_mode_orientations = plain_mode_orientations
150+
self.plain_mode_space_width = plain_mode_space_width
151+
self.layout_mode_space_vertically = layout_mode_space_vertically
152+
self.layout_mode_scale_weight = layout_mode_scale_weight
153+
self.layout_mode_strip_rotated = layout_mode_strip_rotated
154+
self.layout_mode_font_height_weight = layout_mode_font_height_weight
155+
82156
def to_dict(self):
83157
"""
84158
Serializes the component to a dictionary.
@@ -87,7 +161,15 @@ def to_dict(self):
87161
Dictionary with serialized data.
88162
"""
89163
return default_to_dict(
90-
self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
164+
self,
165+
converter=(serialize_class_instance(self.converter) if self.converter else None),
166+
extraction_mode=str(self.extraction_mode),
167+
plain_mode_orientations=self.plain_mode_orientations,
168+
plain_mode_space_width=self.plain_mode_space_width,
169+
layout_mode_space_vertically=self.layout_mode_space_vertically,
170+
layout_mode_scale_weight=self.layout_mode_scale_weight,
171+
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
172+
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
91173
)
92174

93175
@classmethod
@@ -108,7 +190,20 @@ def from_dict(cls, data):
108190
return default_from_dict(cls, data)
109191

110192
def _default_convert(self, reader: "PdfReader") -> Document:
    """
    Convert the pages of a PDF into a single Document.

    Text is extracted from every page in `reader.pages` using the extraction settings
    stored on this component, and the page texts are joined with a form feed character.

    :param reader:
        The reader whose `pages` are extracted.
    :returns:
        A Document whose content is the text of all pages, separated by form feeds.
    """
    # A generator feeds `join` directly; the same keyword arguments are passed for every page.
    text = "\f".join(
        page.extract_text(
            orientations=self.plain_mode_orientations,
            extraction_mode=self.extraction_mode.value,
            space_width=self.plain_mode_space_width,
            layout_mode_space_vertically=self.layout_mode_space_vertically,
            layout_mode_scale_weight=self.layout_mode_scale_weight,
            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
        )
        for page in reader.pages
    )
    return Document(content=text)
113208

114209
@component.output_types(documents=List[Document])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
enhancements:
3+
- |
4+
Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
5+
from PDF files.

Diff for: test/components/converters/test_pypdf_to_document.py

+124-25
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44
import logging
5-
from unittest.mock import patch
5+
from unittest.mock import patch, Mock
66

77
import pytest
88

99
from haystack import Document, default_from_dict, default_to_dict
10-
from haystack.components.converters.pypdf import PyPDFToDocument
10+
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
1111
from haystack.dataclasses import ByteStream
1212

1313

1414
@pytest.fixture
15-
def pypdf_converter():
15+
def pypdf_component():
1616
return PyPDFToDocument()
1717

1818

@@ -30,38 +30,104 @@ def from_dict(cls, data):
3030

3131

3232
class TestPyPDFToDocument:
33-
def test_init(self, pypdf_converter):
34-
assert pypdf_converter.converter is None
33+
def test_init(self, pypdf_component):
34+
assert pypdf_component.converter is None
35+
assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
36+
assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
37+
assert pypdf_component.plain_mode_space_width == 200.0
38+
assert pypdf_component.layout_mode_space_vertically is True
39+
assert pypdf_component.layout_mode_scale_weight == 1.25
40+
assert pypdf_component.layout_mode_strip_rotated is True
41+
assert pypdf_component.layout_mode_font_height_weight == 1.0
3542

36-
def test_init_params(self):
37-
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
38-
assert isinstance(pypdf_converter.converter, CustomConverter)
43+
def test_init_converter(self):
44+
pypdf_component = PyPDFToDocument(converter=CustomConverter())
45+
assert isinstance(pypdf_component.converter, CustomConverter)
3946

40-
def test_to_dict(self, pypdf_converter):
41-
data = pypdf_converter.to_dict()
47+
def test_init_custom_params(self):
48+
pypdf_component = PyPDFToDocument(
49+
extraction_mode="layout",
50+
plain_mode_orientations=(0, 90),
51+
plain_mode_space_width=150.0,
52+
layout_mode_space_vertically=False,
53+
layout_mode_scale_weight=2.0,
54+
layout_mode_strip_rotated=False,
55+
layout_mode_font_height_weight=0.5,
56+
)
57+
58+
assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
59+
assert pypdf_component.plain_mode_orientations == (0, 90)
60+
assert pypdf_component.plain_mode_space_width == 150.0
61+
assert pypdf_component.layout_mode_space_vertically is False
62+
assert pypdf_component.layout_mode_scale_weight == 2.0
63+
assert pypdf_component.layout_mode_strip_rotated is False
64+
assert pypdf_component.layout_mode_font_height_weight == 0.5
65+
66+
def test_init_invalid_extraction_mode(self):
67+
with pytest.raises(ValueError):
68+
PyPDFToDocument(extraction_mode="invalid")
69+
70+
def test_to_dict(self, pypdf_component):
71+
data = pypdf_component.to_dict()
4272
assert data == {
4373
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
44-
"init_parameters": {"converter": None},
74+
"init_parameters": {
75+
"converter": None,
76+
"extraction_mode": "plain",
77+
"plain_mode_orientations": (0, 90, 180, 270),
78+
"plain_mode_space_width": 200.0,
79+
"layout_mode_space_vertically": True,
80+
"layout_mode_scale_weight": 1.25,
81+
"layout_mode_strip_rotated": True,
82+
"layout_mode_font_height_weight": 1.0,
83+
},
4584
}
4685

4786
def test_to_dict_custom_converter(self):
48-
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
49-
data = pypdf_converter.to_dict()
87+
pypdf_component = PyPDFToDocument(converter=CustomConverter())
88+
data = pypdf_component.to_dict()
5089
assert data == {
5190
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
5291
"init_parameters": {
5392
"converter": {
5493
"data": {"key": "value", "more": False},
5594
"type": "converters.test_pypdf_to_document.CustomConverter",
56-
}
95+
},
96+
"extraction_mode": "plain",
97+
"plain_mode_orientations": (0, 90, 180, 270),
98+
"plain_mode_space_width": 200.0,
99+
"layout_mode_space_vertically": True,
100+
"layout_mode_scale_weight": 1.25,
101+
"layout_mode_strip_rotated": True,
102+
"layout_mode_font_height_weight": 1.0,
57103
},
58104
}
59105

60106
def test_from_dict(self):
61-
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
107+
data = {
108+
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
109+
"init_parameters": {
110+
"converter": None,
111+
"extraction_mode": "plain",
112+
"plain_mode_orientations": (0, 90, 180, 270),
113+
"plain_mode_space_width": 200.0,
114+
"layout_mode_space_vertically": True,
115+
"layout_mode_scale_weight": 1.25,
116+
"layout_mode_strip_rotated": True,
117+
"layout_mode_font_height_weight": 1.0,
118+
},
119+
}
120+
62121
instance = PyPDFToDocument.from_dict(data)
63122
assert isinstance(instance, PyPDFToDocument)
64123
assert instance.converter is None
124+
assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
125+
assert instance.plain_mode_orientations == (0, 90, 180, 270)
126+
assert instance.plain_mode_space_width == 200.0
127+
assert instance.layout_mode_space_vertically is True
128+
assert instance.layout_mode_scale_weight == 1.25
129+
assert instance.layout_mode_strip_rotated is True
130+
assert instance.layout_mode_font_height_weight == 1.0
65131

66132
def test_from_dict_defaults(self):
67133
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
@@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self):
83149
assert isinstance(instance, PyPDFToDocument)
84150
assert isinstance(instance.converter, CustomConverter)
85151

152+
def test_default_convert(self):
153+
mock_page1 = Mock()
154+
mock_page2 = Mock()
155+
mock_page1.extract_text.return_value = "Page 1 content"
156+
mock_page2.extract_text.return_value = "Page 2 content"
157+
mock_reader = Mock()
158+
mock_reader.pages = [mock_page1, mock_page2]
159+
160+
converter = PyPDFToDocument(
161+
extraction_mode="layout",
162+
plain_mode_orientations=(0, 90),
163+
plain_mode_space_width=150.0,
164+
layout_mode_space_vertically=False,
165+
layout_mode_scale_weight=2.0,
166+
layout_mode_strip_rotated=False,
167+
layout_mode_font_height_weight=1.5,
168+
)
169+
170+
doc = converter._default_convert(mock_reader)
171+
assert doc.content == "Page 1 content\fPage 2 content"
172+
173+
expected_params = {
174+
"extraction_mode": "layout",
175+
"orientations": (0, 90),
176+
"space_width": 150.0,
177+
"layout_mode_space_vertically": False,
178+
"layout_mode_scale_weight": 2.0,
179+
"layout_mode_strip_rotated": False,
180+
"layout_mode_font_height_weight": 1.5,
181+
}
182+
for mock_page in mock_reader.pages:
183+
mock_page.extract_text.assert_called_once_with(**expected_params)
184+
86185
@pytest.mark.integration
87-
def test_run(self, test_files_path, pypdf_converter):
186+
def test_run(self, test_files_path, pypdf_component):
88187
"""
89188
Test if the component runs correctly.
90189
"""
91190
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
92-
output = pypdf_converter.run(sources=paths)
191+
output = pypdf_component.run(sources=paths)
93192
docs = output["documents"]
94193
assert len(docs) == 1
95194
assert "History" in docs[0].content
96195

97196
@pytest.mark.integration
98-
def test_page_breaks_added(self, test_files_path, pypdf_converter):
197+
def test_page_breaks_added(self, test_files_path, pypdf_component):
99198
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
100-
output = pypdf_converter.run(sources=paths)
199+
output = pypdf_component.run(sources=paths)
101200
docs = output["documents"]
102201
assert len(docs) == 1
103202
assert docs[0].content.count("\f") == 3
104203

105-
def test_run_with_meta(self, test_files_path, pypdf_converter):
204+
def test_run_with_meta(self, test_files_path, pypdf_component):
106205
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
107206

108207
with patch("haystack.components.converters.pypdf.PdfReader"):
109-
output = pypdf_converter.run(
208+
output = pypdf_component.run(
110209
sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
111210
)
112211

@@ -115,25 +214,25 @@ def test_run_with_meta(self, test_files_path, pypdf_converter):
115214
assert output["documents"][0].meta["language"] == "it"
116215
assert output["documents"][1].meta["language"] == "it"
117216

118-
def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
217+
def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
119218
"""
120219
Test if the component correctly handles errors.
121220
"""
122221
paths = ["non_existing_file.pdf"]
123222
with caplog.at_level(logging.WARNING):
124-
pypdf_converter.run(sources=paths)
223+
pypdf_component.run(sources=paths)
125224
assert "Could not read non_existing_file.pdf" in caplog.text
126225

127226
@pytest.mark.integration
128-
def test_mixed_sources_run(self, test_files_path, pypdf_converter):
227+
def test_mixed_sources_run(self, test_files_path, pypdf_component):
129228
"""
130229
Test if the component runs correctly when mixed sources are provided.
131230
"""
132231
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
133232
with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
134233
paths.append(ByteStream(f.read()))
135234

136-
output = pypdf_converter.run(sources=paths)
235+
output = pypdf_component.run(sources=paths)
137236
docs = output["documents"]
138237
assert len(docs) == 2
139238
assert "History and standardization" in docs[0].content

0 commit comments

Comments
 (0)