@@ -196,7 +196,7 @@ def from_dict(cls, data):
196
196
data ["init_parameters" ]["converter" ] = deserialize_class_instance (custom_converter_data )
197
197
return default_from_dict (cls , data )
198
198
199
- def _default_convert (self , reader : "PdfReader" ) -> Document :
199
+ def _default_convert (self , reader : "PdfReader" ) -> str :
200
200
texts = []
201
201
for page in reader .pages :
202
202
texts .append (
@@ -211,7 +211,7 @@ def _default_convert(self, reader: "PdfReader") -> Document:
211
211
)
212
212
)
213
213
text = "\f " .join (texts )
214
- return Document ( content = text )
214
+ return text
215
215
216
216
@component .output_types (documents = List [Document ])
217
217
def run (
@@ -246,16 +246,18 @@ def run(
246
246
continue
247
247
try :
248
248
pdf_reader = PdfReader (io .BytesIO (bytestream .data ))
249
- document = (
250
- self ._default_convert (pdf_reader ) if self .converter is None else self .converter .convert (pdf_reader )
249
+ text = (
250
+ self ._default_convert (pdf_reader )
251
+ if self .converter is None
252
+ else self .converter .convert (pdf_reader ).content
251
253
)
252
254
except Exception as e :
253
255
logger .warning (
254
256
"Could not read {source} and convert it to Document, skipping. {error}" , source = source , error = e
255
257
)
256
258
continue
257
259
258
- if document . content is None or document . content .strip () == "" :
260
+ if text is None or text .strip () == "" :
259
261
logger .warning (
260
262
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document." ,
261
263
source = source ,
@@ -270,7 +272,7 @@ def run(
270
272
)
271
273
if not self .store_full_path and (file_path := bytestream .meta .get ("file_path" )):
272
274
merged_metadata ["file_path" ] = os .path .basename (file_path )
273
- document . meta = merged_metadata
275
+ document = Document ( content = text , meta = merged_metadata )
274
276
documents .append (document )
275
277
276
278
return {"documents" : documents }
0 commit comments