|
6 | 6 | import pytest
|
7 | 7 |
|
8 | 8 | from haystack import Document
|
| 9 | +from haystack.components.preprocessors import DocumentSplitter |
9 | 10 | from haystack.dataclasses import ByteStream
|
10 | 11 | from haystack.components.converters.pdfminer import PDFMinerToDocument
|
11 | 12 |
|
@@ -155,3 +156,32 @@ def test_run_empty_document(self, caplog, test_files_path):
|
155 | 156 | # Check that not only content is used when the returned document is initialized and doc id is generated
|
156 | 157 | assert results["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
|
157 | 158 | assert results["documents"][0].id != Document(content="").id
|
| 159 | + |
def test_run_detect_pages_and_split_by_passage(self, test_files_path):
    """
    Convert the sample PDF and check that splitting it page by page yields four chunks.

    Note: despite the "passage" in the test name, the splitter here is configured with `split_by="page"`.
    """
    pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
    converted = PDFMinerToDocument().run(sources=[pdf_path])

    # One page per resulting document.
    page_splitter = DocumentSplitter(split_length=1, split_by="page")
    pages = page_splitter.run(converted["documents"])["documents"]

    assert len(pages) == 4
| 167 | + |
def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
    """
    Convert the sample PDF and check that its output can be split passage by passage.

    The test asserts both the total number of chunks produced with `split_by="passage"` and the exact
    text of the chunk at index 6.
    """
    pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
    converted = PDFMinerToDocument().run(sources=[pdf_path])

    # One passage per resulting document.
    passage_splitter = DocumentSplitter(split_length=1, split_by="passage")
    passages = passage_splitter.run(converted["documents"])["documents"]

    assert len(passages) == 29

    # The fragments below are concatenated verbatim; line breaks and trailing spaces inside them matter.
    expected_passage = "".join(
        (
            "\nA wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively",
            " \nedited and managed by its own audience directly using a web browser. A typical wiki \ncontains ",
            "multiple pages for the subjects or scope of the project and may be either open \nto the public or ",
            "limited to use within an organization for maintaining its internal knowledge \nbase. Wikis are ",
            "enabled by wiki software, otherwise known as wiki engines. A wiki engine, \nbeing a form of a ",
            "content management system, differs from other web-based systems \nsuch as blog software, in that ",
            "the content is created without any defined owner or leader, \nand wikis have little inherent ",
            "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n",
        )
    )
    assert passages[6].content == expected_passage
0 commit comments