Skip to content

Commit 2c84266

Browse files
test: adding test for PyPDF to extract passages so that they are detected by DocumentSplitter (#8739)
1 parent 21dd03d commit 2c84266

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed

Diff for: haystack/components/converters/pypdf.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -158,17 +158,16 @@ def from_dict(cls, data):
158158
def _default_convert(self, reader: "PdfReader") -> str:
159159
texts = []
160160
for page in reader.pages:
161-
texts.append(
162-
page.extract_text(
163-
orientations=self.plain_mode_orientations,
164-
extraction_mode=self.extraction_mode.value,
165-
space_width=self.plain_mode_space_width,
166-
layout_mode_space_vertically=self.layout_mode_space_vertically,
167-
layout_mode_scale_weight=self.layout_mode_scale_weight,
168-
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
169-
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
170-
)
161+
extracted_text = page.extract_text(
162+
orientations=self.plain_mode_orientations,
163+
extraction_mode=self.extraction_mode.value,
164+
space_width=self.plain_mode_space_width,
165+
layout_mode_space_vertically=self.layout_mode_space_vertically,
166+
layout_mode_scale_weight=self.layout_mode_scale_weight,
167+
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
168+
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
171169
)
170+
texts.append(extracted_text)
172171
text = "\f".join(texts)
173172
return text
174173

Diff for: test/components/converters/test_pypdf_to_document.py

+22
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from haystack import Document, default_from_dict, default_to_dict
1010
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
11+
from haystack.components.preprocessors import DocumentSplitter
1112
from haystack.dataclasses import ByteStream
1213

1314

@@ -213,3 +214,24 @@ def test_run_empty_document(self, caplog, test_files_path):
213214
# Check that meta is used when the returned document is initialized and thus when doc id is generated
214215
assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
215216
assert output["documents"][0].id != Document(content="").id
217+
218+
def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
219+
converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT)
220+
sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
221+
pdf_doc = converter.run(sources=sources)
222+
splitter = DocumentSplitter(split_length=1, split_by="passage")
223+
docs = splitter.run(pdf_doc["documents"])
224+
225+
assert len(docs["documents"]) == 51
226+
227+
expected = (
228+
"A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
229+
"edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
230+
"multiple pages for the subjects or scope of the project and may be either open\nto the public or "
231+
"limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
232+
"enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
233+
"content management system, differs from other web-based systems\nsuch as blog software, in that "
234+
"the content is created without any defined owner or leader,\nand wikis have little inherent "
235+
"structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"
236+
)
237+
assert docs["documents"][2].content == expected

0 commit comments

Comments
 (0)