Skip to content

Commit 35334f1

Browse files
committed
fix: PyPDFToDocument initializes documents with content and meta
1 parent 167a48e commit 35334f1

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

haystack/components/converters/pypdf.py

+8-6
Original file line number | Diff line number | Diff line change
@@ -196,7 +196,7 @@ def from_dict(cls, data):
196196
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
197197
return default_from_dict(cls, data)
198198

199-
def _default_convert(self, reader: "PdfReader") -> Document:
199+
def _default_convert(self, reader: "PdfReader") -> str:
200200
texts = []
201201
for page in reader.pages:
202202
texts.append(
@@ -211,7 +211,7 @@ def _default_convert(self, reader: "PdfReader") -> Document:
211211
)
212212
)
213213
text = "\f".join(texts)
214-
return Document(content=text)
214+
return text
215215

216216
@component.output_types(documents=List[Document])
217217
def run(
@@ -246,16 +246,18 @@ def run(
246246
continue
247247
try:
248248
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
249-
document = (
250-
self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader)
249+
text = (
250+
self._default_convert(pdf_reader)
251+
if self.converter is None
252+
else self.converter.convert(pdf_reader).content
251253
)
252254
except Exception as e:
253255
logger.warning(
254256
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
255257
)
256258
continue
257259

258-
if document.content is None or document.content.strip() == "":
260+
if text is None or text.strip() == "":
259261
logger.warning(
260262
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
261263
source=source,
@@ -270,7 +272,7 @@ def run(
270272
)
271273
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
272274
merged_metadata["file_path"] = os.path.basename(file_path)
273-
document.meta = merged_metadata
275+
document = Document(content=text, meta=merged_metadata)
274276
documents.append(document)
275277

276278
return {"documents": documents}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
PyPDFToDocument now creates documents with an id based on both the converted text and the metadata. Previously, the metadata was not taken into account when the id was generated.

test/components/converters/test_pypdf_to_document.py

+6-2
Original file line number | Diff line number | Diff line change
@@ -169,8 +169,8 @@ def test_default_convert(self):
169169
layout_mode_font_height_weight=1.5,
170170
)
171171

172-
doc = converter._default_convert(mock_reader)
173-
assert doc.content == "Page 1 content\fPage 2 content"
172+
text = converter._default_convert(mock_reader)
173+
assert text == "Page 1 content\fPage 2 content"
174174

175175
expected_params = {
176176
"extraction_mode": "layout",
@@ -292,3 +292,7 @@ def test_run_empty_document(self, caplog, test_files_path):
292292
output = PyPDFToDocument().run(sources=paths)
293293
assert "PyPDFToDocument could not extract text from the file" in caplog.text
294294
assert output["documents"][0].content == ""
295+
296+
# Check that meta is passed when the returned document is initialized, so that it contributes to the generated doc id
297+
assert "non_text_searchable.pdf" in output["documents"][0].meta["file_path"]
298+
assert output["documents"][0].id != Document(content="").id

0 commit comments

Comments
 (0)