Skip to content

Commit 5af2888

Browse files
fix: PDFMinerToDocument convert function - adding double new lines between each container_text so that passages can be detected. (#8729)
* initial import * adding double new lines between container_texts so that passages can be detected * reducing type specification to avoid import error * adding release notes * renaming variable
1 parent 424bce2 commit 5af2888

File tree

3 files changed

+45
-7
lines changed

3 files changed

+45
-7
lines changed

Diff for: haystack/components/converters/pdfminer.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import io
66
import os
77
from pathlib import Path
8-
from typing import Any, Dict, List, Optional, Union
8+
from typing import Any, Dict, Iterator, List, Optional, Union
99

1010
from haystack import Document, component, logging
1111
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -98,23 +98,27 @@ def __init__( # pylint: disable=too-many-positional-arguments
9898
)
9999
self.store_full_path = store_full_path
100100

101-
def _converter(self, extractor) -> str:
101+
@staticmethod
102+
def _converter(lt_page_objs: Iterator) -> str:
102103
"""
103104
Extracts text from PDF pages then converts the text into a single str
104105
105-
:param extractor:
106+
:param lt_page_objs:
106107
Python generator that yields PDF pages.
107108
108109
:returns:
109110
PDF text converted to single str
110111
"""
111112
pages = []
112-
for page in extractor:
113+
for page in lt_page_objs:
113114
text = ""
114115
for container in page:
115116
# Keep text only
116117
if isinstance(container, LTTextContainer):
117-
text += container.get_text()
118+
container_text = container.get_text()
119+
if container_text:
120+
text += "\n\n"
121+
text += container_text
118122
pages.append(text)
119123

120124
# Add a page delimiter
@@ -156,8 +160,8 @@ def run(
156160
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
157161
continue
158162
try:
159-
pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
160-
text = self._converter(pdf_reader)
163+
pages = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
164+
text = self._converter(pages)
161165
except Exception as e:
162166
logger.warning(
163167
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
Updated the `PDFMinerToDocument` convert function to add double new lines between each `container_text` so that passages can later be detected by `DocumentSplitter`.

Diff for: test/components/converters/test_pdfminer_to_document.py

+30
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77

88
from haystack import Document
9+
from haystack.components.preprocessors import DocumentSplitter
910
from haystack.dataclasses import ByteStream
1011
from haystack.components.converters.pdfminer import PDFMinerToDocument
1112

@@ -155,3 +156,32 @@ def test_run_empty_document(self, caplog, test_files_path):
155156
# Check that not only content is used when the returned document is initialized and doc id is generated
156157
assert results["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
157158
assert results["documents"][0].id != Document(content="").id
159+
160+
def test_run_detect_pages_and_split_by_passage(self, test_files_path):
161+
converter = PDFMinerToDocument()
162+
sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
163+
pdf_doc = converter.run(sources=sources)
164+
splitter = DocumentSplitter(split_length=1, split_by="page")
165+
docs = splitter.run(pdf_doc["documents"])
166+
assert len(docs["documents"]) == 4
167+
168+
def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
169+
converter = PDFMinerToDocument()
170+
sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
171+
pdf_doc = converter.run(sources=sources)
172+
splitter = DocumentSplitter(split_length=1, split_by="passage")
173+
docs = splitter.run(pdf_doc["documents"])
174+
175+
assert len(docs["documents"]) == 29
176+
177+
expected = (
178+
"\nA wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively"
179+
" \nedited and managed by its own audience directly using a web browser. A typical wiki \ncontains "
180+
"multiple pages for the subjects or scope of the project and may be either open \nto the public or "
181+
"limited to use within an organization for maintaining its internal knowledge \nbase. Wikis are "
182+
"enabled by wiki software, otherwise known as wiki engines. A wiki engine, \nbeing a form of a "
183+
"content management system, differs from other web-based systems \nsuch as blog software, in that "
184+
"the content is created without any defined owner or leader, \nand wikis have little inherent "
185+
"structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
186+
)
187+
assert docs["documents"][6].content == expected

0 commit comments

Comments
 (0)