|
8 | 8 |
|
9 | 9 | from haystack import Document, default_from_dict, default_to_dict
|
10 | 10 | from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
|
| 11 | +from haystack.components.preprocessors import DocumentSplitter |
11 | 12 | from haystack.dataclasses import ByteStream
|
12 | 13 |
|
13 | 14 |
|
@@ -213,3 +214,24 @@ def test_run_empty_document(self, caplog, test_files_path):
|
213 | 214 | # Check that meta is used when the returned document is initialized and thus when doc id is generated
|
214 | 215 | assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
|
215 | 216 | assert output["documents"][0].id != Document(content="").id
|
| 217 | + |
def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
    """
    Convert a sample PDF in layout extraction mode, split the result by passage,
    and check both the number of passages and the exact text of the third one.
    """
    pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"

    # Layout mode is the extraction mode under test here.
    layout_converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT)
    converted = layout_converter.run(sources=[pdf_path])

    # One passage per output document.
    passage_splitter = DocumentSplitter(split_length=1, split_by="passage")
    passages = passage_splitter.run(documents=converted["documents"])["documents"]

    assert len(passages) == 51

    # Expected text of the third passage. Single "\n" characters are line breaks
    # inside the passage; the trailing "\n\n" is presumably the blank-line
    # separator that passage splitting keys on (confirm against DocumentSplitter).
    expected = (
        "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
        "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
        "multiple pages for the subjects or scope of the project and may be either open\nto the public or "
        "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
        "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
        "content management system, differs from other web-based systems\nsuch as blog software, in that "
        "the content is created without any defined owner or leader,\nand wikis have little inherent "
        "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"
    )
    assert passages[2].content == expected
0 commit comments