2
2
#
3
3
# SPDX-License-Identifier: Apache-2.0
4
4
import logging
5
- from unittest .mock import patch
5
+ from unittest .mock import patch , Mock
6
6
7
7
import pytest
8
8
9
9
from haystack import Document , default_from_dict , default_to_dict
10
- from haystack .components .converters .pypdf import PyPDFToDocument
10
+ from haystack .components .converters .pypdf import PyPDFToDocument , PyPDFExtractionMode
11
11
from haystack .dataclasses import ByteStream
12
12
13
13
14
14
@pytest.fixture
def pypdf_component():
    """
    Provide a `PyPDFToDocument` instance constructed with no arguments.
    """
    component = PyPDFToDocument()
    return component
17
17
18
18
@@ -30,38 +30,104 @@ def from_dict(cls, data):
30
30
31
31
32
32
class TestPyPDFToDocument :
33
def test_init(self, pypdf_component):
    """
    Check the attribute values of the component supplied by the `pypdf_component` fixture.
    """
    component = pypdf_component

    # No custom converter is attached.
    assert component.converter is None

    # The extraction mode compares equal to the PLAIN enum member.
    assert component.extraction_mode == PyPDFExtractionMode.PLAIN

    # Settings prefixed with "plain_mode_".
    assert component.plain_mode_orientations == (0, 90, 180, 270)
    assert component.plain_mode_space_width == 200.0

    # Settings prefixed with "layout_mode_".
    assert component.layout_mode_space_vertically is True
    assert component.layout_mode_scale_weight == 1.25
    assert component.layout_mode_strip_rotated is True
    assert component.layout_mode_font_height_weight == 1.0
35
42
36
def test_init_converter(self):
    """
    Check that the `converter` attribute is a `CustomConverter` when one is passed to the constructor.
    """
    custom = CustomConverter()
    component = PyPDFToDocument(converter=custom)
    assert isinstance(component.converter, CustomConverter)
39
46
40
- def test_to_dict (self , pypdf_converter ):
41
- data = pypdf_converter .to_dict ()
47
def test_init_custom_params(self):
    """
    Check that explicitly passed constructor arguments are reflected on the component's attributes.
    """
    init_kwargs = {
        "extraction_mode": "layout",
        "plain_mode_orientations": (0, 90),
        "plain_mode_space_width": 150.0,
        "layout_mode_space_vertically": False,
        "layout_mode_scale_weight": 2.0,
        "layout_mode_strip_rotated": False,
        "layout_mode_font_height_weight": 0.5,
    }
    component = PyPDFToDocument(**init_kwargs)

    # The mode was passed as the string "layout"; it must compare equal to the LAYOUT enum member.
    assert component.extraction_mode == PyPDFExtractionMode.LAYOUT

    assert component.plain_mode_orientations == (0, 90)
    assert component.plain_mode_space_width == 150.0

    assert component.layout_mode_space_vertically is False
    assert component.layout_mode_scale_weight == 2.0
    assert component.layout_mode_strip_rotated is False
    assert component.layout_mode_font_height_weight == 0.5
65
+
66
def test_init_invalid_extraction_mode(self):
    """
    Check that constructing the component with `extraction_mode="invalid"` raises `ValueError`.
    """
    bad_mode = "invalid"
    with pytest.raises(ValueError):
        PyPDFToDocument(extraction_mode=bad_mode)
69
+
70
def test_to_dict(self, pypdf_component):
    """
    Check the dictionary produced by `to_dict()` for the fixture-provided component.
    """
    expected_init_parameters = {
        "converter": None,
        "extraction_mode": "plain",
        "plain_mode_orientations": (0, 90, 180, 270),
        "plain_mode_space_width": 200.0,
        "layout_mode_space_vertically": True,
        "layout_mode_scale_weight": 1.25,
        "layout_mode_strip_rotated": True,
        "layout_mode_font_height_weight": 1.0,
    }
    expected = {
        "type": "haystack.components.converters.pypdf.PyPDFToDocument",
        "init_parameters": expected_init_parameters,
    }

    serialized = pypdf_component.to_dict()
    assert serialized == expected
46
85
47
86
def test_to_dict_custom_converter(self):
    """
    Check the dictionary produced by `to_dict()` when the component holds a `CustomConverter`.
    """
    # Expected entry for the custom converter under "init_parameters".
    expected_converter = {
        "data": {"key": "value", "more": False},
        "type": "converters.test_pypdf_to_document.CustomConverter",
    }
    expected = {
        "type": "haystack.components.converters.pypdf.PyPDFToDocument",
        "init_parameters": {
            "converter": expected_converter,
            "extraction_mode": "plain",
            "plain_mode_orientations": (0, 90, 180, 270),
            "plain_mode_space_width": 200.0,
            "layout_mode_space_vertically": True,
            "layout_mode_scale_weight": 1.25,
            "layout_mode_strip_rotated": True,
            "layout_mode_font_height_weight": 1.0,
        },
    }

    component = PyPDFToDocument(converter=CustomConverter())
    serialized = component.to_dict()
    assert serialized == expected
59
105
60
106
def test_from_dict(self):
    """
    Check that `from_dict()` builds a component whose attributes match the given init parameters.
    """
    init_parameters = {
        "converter": None,
        "extraction_mode": "plain",
        "plain_mode_orientations": (0, 90, 180, 270),
        "plain_mode_space_width": 200.0,
        "layout_mode_space_vertically": True,
        "layout_mode_scale_weight": 1.25,
        "layout_mode_strip_rotated": True,
        "layout_mode_font_height_weight": 1.0,
    }
    payload = {
        "type": "haystack.components.converters.pypdf.PyPDFToDocument",
        "init_parameters": init_parameters,
    }

    restored = PyPDFToDocument.from_dict(payload)

    assert isinstance(restored, PyPDFToDocument)
    assert restored.converter is None
    # The mode was given as the string "plain"; it must compare equal to the PLAIN enum member.
    assert restored.extraction_mode == PyPDFExtractionMode.PLAIN
    assert restored.plain_mode_orientations == (0, 90, 180, 270)
    assert restored.plain_mode_space_width == 200.0
    assert restored.layout_mode_space_vertically is True
    assert restored.layout_mode_scale_weight == 1.25
    assert restored.layout_mode_strip_rotated is True
    assert restored.layout_mode_font_height_weight == 1.0
65
131
66
132
def test_from_dict_defaults (self ):
67
133
data = {"type" : "haystack.components.converters.pypdf.PyPDFToDocument" , "init_parameters" : {}}
@@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self):
83
149
assert isinstance (instance , PyPDFToDocument )
84
150
assert isinstance (instance .converter , CustomConverter )
85
151
152
def test_default_convert(self):
    """
    Check `_default_convert()` against a mocked reader with two pages.

    Verifies the content of the returned document and the keyword arguments
    with which each page's `extract_text` was called.
    """
    # Build one mocked page per text; each page's extract_text returns its text.
    fake_pages = []
    for page_text in ("Page 1 content", "Page 2 content"):
        fake_page = Mock()
        fake_page.extract_text.return_value = page_text
        fake_pages.append(fake_page)

    fake_reader = Mock()
    fake_reader.pages = fake_pages

    component = PyPDFToDocument(
        extraction_mode="layout",
        plain_mode_orientations=(0, 90),
        plain_mode_space_width=150.0,
        layout_mode_space_vertically=False,
        layout_mode_scale_weight=2.0,
        layout_mode_strip_rotated=False,
        layout_mode_font_height_weight=1.5,
    )

    document = component._default_convert(fake_reader)
    # NOTE(review): the space after the form-feed escape may be an extraction
    # artifact of this listing - confirm against the converter's page separator.
    assert document.content == "Page 1 content\f Page 2 content"

    # The "plain_mode_" prefix is absent from the orientations/space_width keys
    # expected in the extract_text call.
    expected_call_kwargs = {
        "extraction_mode": "layout",
        "orientations": (0, 90),
        "space_width": 150.0,
        "layout_mode_space_vertically": False,
        "layout_mode_scale_weight": 2.0,
        "layout_mode_strip_rotated": False,
        "layout_mode_font_height_weight": 1.5,
    }
    for fake_page in fake_reader.pages:
        fake_page.extract_text.assert_called_once_with(**expected_call_kwargs)
184
+
86
185
@pytest.mark.integration
def test_run(self, test_files_path, pypdf_component):
    """
    Test if the component runs correctly on a single sample PDF path.
    """
    sample_pdf = test_files_path / "pdf" / "sample_pdf_1.pdf"

    result = pypdf_component.run(sources=[sample_pdf])
    documents = result["documents"]

    # One source in, one document out, containing the word "History".
    assert len(documents) == 1
    assert "History" in documents[0].content
96
195
97
196
@pytest.mark.integration
def test_page_breaks_added(self, test_files_path, pypdf_component):
    """
    Test that the content of the converted sample PDF contains three page-break markers.
    """
    sample_pdf = test_files_path / "pdf" / "sample_pdf_1.pdf"

    result = pypdf_component.run(sources=[sample_pdf])
    documents = result["documents"]

    assert len(documents) == 1
    # NOTE(review): the space after the form-feed escape may be an extraction
    # artifact of this listing - confirm against the converter's page separator.
    assert documents[0].content.count("\f ") == 3
104
203
105
- def test_run_with_meta (self , test_files_path , pypdf_converter ):
204
+ def test_run_with_meta (self , test_files_path , pypdf_component ):
106
205
bytestream = ByteStream (data = b"test" , meta = {"author" : "test_author" , "language" : "en" })
107
206
108
207
with patch ("haystack.components.converters.pypdf.PdfReader" ):
109
- output = pypdf_converter .run (
208
+ output = pypdf_component .run (
110
209
sources = [bytestream , test_files_path / "pdf" / "sample_pdf_1.pdf" ], meta = {"language" : "it" }
111
210
)
112
211
@@ -115,25 +214,25 @@ def test_run_with_meta(self, test_files_path, pypdf_converter):
115
214
assert output ["documents" ][0 ].meta ["language" ] == "it"
116
215
assert output ["documents" ][1 ].meta ["language" ] == "it"
117
216
118
def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
    """
    Test if the component correctly handles errors.

    A source path that does not exist must produce a "Could not read ..." log message.
    """
    missing_sources = ["non_existing_file.pdf"]

    # Capture log records at WARNING level and above while the component runs.
    with caplog.at_level(logging.WARNING):
        pypdf_component.run(sources=missing_sources)
        assert "Could not read non_existing_file.pdf" in caplog.text
126
225
127
226
@pytest.mark.integration
def test_mixed_sources_run(self, test_files_path, pypdf_component):
    """
    Test if the component runs correctly when mixed sources are provided.

    The same sample PDF is supplied once as a path and once as a `ByteStream`
    built from the file's bytes.
    """
    sample_pdf = test_files_path / "pdf" / "sample_pdf_1.pdf"

    with open(sample_pdf, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    mixed_sources = [sample_pdf, ByteStream(raw_bytes)]

    output = pypdf_component.run(sources=mixed_sources)
    docs = output["documents"]

    # Two sources in, two documents out.
    assert len(docs) == 2
    assert "History and standardization" in docs[0].content
0 commit comments