@@ -16,22 +16,8 @@ def pypdf_component():
16
16
return PyPDFToDocument ()
17
17
18
18
19
- class CustomConverter :
20
- def convert (self , reader ):
21
- return Document (content = "Custom converter" )
22
-
23
- def to_dict (self ):
24
- return {"key" : "value" , "more" : False }
25
-
26
- @classmethod
27
- def from_dict (cls , data ):
28
- assert data == {"key" : "value" , "more" : False }
29
- return cls ()
30
-
31
-
32
19
class TestPyPDFToDocument :
33
20
def test_init (self , pypdf_component ):
34
- assert pypdf_component .converter is None
35
21
assert pypdf_component .extraction_mode == PyPDFExtractionMode .PLAIN
36
22
assert pypdf_component .plain_mode_orientations == (0 , 90 , 180 , 270 )
37
23
assert pypdf_component .plain_mode_space_width == 200.0
@@ -40,10 +26,6 @@ def test_init(self, pypdf_component):
40
26
assert pypdf_component .layout_mode_strip_rotated is True
41
27
assert pypdf_component .layout_mode_font_height_weight == 1.0
42
28
43
- def test_init_converter (self ):
44
- pypdf_component = PyPDFToDocument (converter = CustomConverter ())
45
- assert isinstance (pypdf_component .converter , CustomConverter )
46
-
47
29
def test_init_custom_params (self ):
48
30
pypdf_component = PyPDFToDocument (
49
31
extraction_mode = "layout" ,
@@ -72,7 +54,6 @@ def test_to_dict(self, pypdf_component):
72
54
assert data == {
73
55
"type" : "haystack.components.converters.pypdf.PyPDFToDocument" ,
74
56
"init_parameters" : {
75
- "converter" : None ,
76
57
"extraction_mode" : "plain" ,
77
58
"plain_mode_orientations" : (0 , 90 , 180 , 270 ),
78
59
"plain_mode_space_width" : 200.0 ,
@@ -84,32 +65,10 @@ def test_to_dict(self, pypdf_component):
84
65
},
85
66
}
86
67
87
- def test_to_dict_custom_converter (self ):
88
- pypdf_component = PyPDFToDocument (converter = CustomConverter (), store_full_path = False )
89
- data = pypdf_component .to_dict ()
90
- assert data == {
91
- "type" : "haystack.components.converters.pypdf.PyPDFToDocument" ,
92
- "init_parameters" : {
93
- "converter" : {
94
- "data" : {"key" : "value" , "more" : False },
95
- "type" : "converters.test_pypdf_to_document.CustomConverter" ,
96
- },
97
- "extraction_mode" : "plain" ,
98
- "plain_mode_orientations" : (0 , 90 , 180 , 270 ),
99
- "plain_mode_space_width" : 200.0 ,
100
- "layout_mode_space_vertically" : True ,
101
- "layout_mode_scale_weight" : 1.25 ,
102
- "layout_mode_strip_rotated" : True ,
103
- "layout_mode_font_height_weight" : 1.0 ,
104
- "store_full_path" : False ,
105
- },
106
- }
107
-
108
68
def test_from_dict (self ):
109
69
data = {
110
70
"type" : "haystack.components.converters.pypdf.PyPDFToDocument" ,
111
71
"init_parameters" : {
112
- "converter" : None ,
113
72
"extraction_mode" : "plain" ,
114
73
"plain_mode_orientations" : (0 , 90 , 180 , 270 ),
115
74
"plain_mode_space_width" : 200.0 ,
@@ -122,7 +81,6 @@ def test_from_dict(self):
122
81
123
82
instance = PyPDFToDocument .from_dict (data )
124
83
assert isinstance (instance , PyPDFToDocument )
125
- assert instance .converter is None
126
84
assert instance .extraction_mode == PyPDFExtractionMode .PLAIN
127
85
assert instance .plain_mode_orientations == (0 , 90 , 180 , 270 )
128
86
assert instance .plain_mode_space_width == 200.0
@@ -135,21 +93,7 @@ def test_from_dict_defaults(self):
135
93
data = {"type" : "haystack.components.converters.pypdf.PyPDFToDocument" , "init_parameters" : {}}
136
94
instance = PyPDFToDocument .from_dict (data )
137
95
assert isinstance (instance , PyPDFToDocument )
138
- assert instance .converter is None
139
-
140
- def test_from_dict_custom_converter (self ):
141
- data = {
142
- "type" : "haystack.components.converters.pypdf.PyPDFToDocument" ,
143
- "init_parameters" : {
144
- "converter" : {
145
- "data" : {"key" : "value" , "more" : False },
146
- "type" : "converters.test_pypdf_to_document.CustomConverter" ,
147
- }
148
- },
149
- }
150
- instance = PyPDFToDocument .from_dict (data )
151
- assert isinstance (instance , PyPDFToDocument )
152
- assert isinstance (instance .converter , CustomConverter )
96
+ assert instance .extraction_mode == PyPDFExtractionMode .PLAIN
153
97
154
98
def test_default_convert (self ):
155
99
mock_page1 = Mock ()
@@ -259,33 +203,6 @@ def test_mixed_sources_run(self, test_files_path, pypdf_component):
259
203
assert "History and standardization" in docs [0 ].content
260
204
assert "History and standardization" in docs [1 ].content
261
205
262
- @pytest .mark .integration
263
- def test_custom_converter (self , test_files_path ):
264
- """
265
- Test if the component correctly handles custom converters.
266
- """
267
- from pypdf import PdfReader
268
-
269
- paths = [test_files_path / "pdf" / "sample_pdf_1.pdf" ]
270
-
271
- class MyCustomConverter :
272
- def convert (self , reader : PdfReader ) -> Document :
273
- return Document (content = "I don't care about converting given pdfs, I always return this" )
274
-
275
- def to_dict (self ):
276
- return default_to_dict (self )
277
-
278
- @classmethod
279
- def from_dict (cls , data ):
280
- return default_from_dict (cls , data )
281
-
282
- component = PyPDFToDocument (converter = MyCustomConverter ())
283
- output = component .run (sources = paths )
284
- docs = output ["documents" ]
285
- assert len (docs ) == 1
286
- assert "ReAct" not in docs [0 ].content
287
- assert "I don't care about converting given pdfs, I always return this" in docs [0 ].content
288
-
289
206
def test_run_empty_document (self , caplog , test_files_path ):
290
207
paths = [test_files_path / "pdf" / "non_text_searchable.pdf" ]
291
208
with caplog .at_level (logging .WARNING ):
0 commit comments