fix: PDFMinerToDocument convert function - adding double new lines between each container_text so that passages can be detected. (#8729)

davidsbatista · web-flow · commit 5af2888e23da · 2025-01-17T13:01:16.000Z
* initial import

* adding double new lines between container_texts so that passages can be detected

* reducing type specification to avoid import error

* adding release notes

* renaming variable
diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py
@@ -5,7 +5,7 @@
 import io
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional, Union
 
 from haystack import Document, component, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -98,23 +98,27 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         )
         self.store_full_path = store_full_path
 
-    def _converter(self, extractor) -> str:
+    @staticmethod
+    def _converter(lt_page_objs: Iterator) -> str:
         """
         Extracts text from PDF pages then converts the text into a single str
 
-        :param extractor:
+        :param lt_page_objs:
             Python generator that yields PDF pages.
 
         :returns:
             PDF text converted to single str
         """
         pages = []
-        for page in extractor:
+        for page in lt_page_objs:
             text = ""
             for container in page:
                 # Keep text only
                 if isinstance(container, LTTextContainer):
-                    text += container.get_text()
+                    container_text = container.get_text()
+                    if container_text:
+                        text += "\n\n"
+                    text += container_text
             pages.append(text)
 
         # Add a page delimiter
@@ -156,8 +160,8 @@ def run(
                 logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                 continue
             try:
-                pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
-                text = self._converter(pdf_reader)
+                pages = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
+                text = self._converter(pages)
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
diff --git a/releasenotes/notes/fixing-PDFMiner-for-passage-detection-62cf5c3e9758bcf9.yaml b/releasenotes/notes/fixing-PDFMiner-for-passage-detection-62cf5c3e9758bcf9.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Updated the `PDFMinerToDocument` convert function to add double new lines between each container_text so that passages can later be detected by `DocumentSplitter`.
diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py
@@ -6,6 +6,7 @@
 import pytest
 
 from haystack import Document
+from haystack.components.preprocessors import DocumentSplitter
 from haystack.dataclasses import ByteStream
 from haystack.components.converters.pdfminer import PDFMinerToDocument
 
@@ -155,3 +156,32 @@ def test_run_empty_document(self, caplog, test_files_path):
             # Check that not only content is used when the returned document is initialized and doc id is generated
             assert results["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
             assert results["documents"][0].id != Document(content="").id
+
+    def test_run_detect_pages_and_split_by_passage(self, test_files_path):
+        converter = PDFMinerToDocument()
+        sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
+        pdf_doc = converter.run(sources=sources)
+        splitter = DocumentSplitter(split_length=1, split_by="page")
+        docs = splitter.run(pdf_doc["documents"])
+        assert len(docs["documents"]) == 4
+
+    def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
+        converter = PDFMinerToDocument()
+        sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
+        pdf_doc = converter.run(sources=sources)
+        splitter = DocumentSplitter(split_length=1, split_by="passage")
+        docs = splitter.run(pdf_doc["documents"])
+
+        assert len(docs["documents"]) == 29
+
+        expected = (
+            "\nA wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively"
+            " \nedited and managed by its own audience directly using a web browser. A typical wiki \ncontains "
+            "multiple pages for the subjects or scope of the project and may be either open \nto the public or "
+            "limited to use within an organization for maintaining its internal knowledge \nbase. Wikis are "
+            "enabled by wiki software, otherwise known as wiki engines. A wiki engine, \nbeing a form of a "
+            "content management system, diﬀers from other web-based systems \nsuch as blog software, in that "
+            "the content is created without any deﬁned owner or leader, \nand wikis have little inherent "
+            "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
+        )
+        assert docs["documents"][6].content == expected

-Original file line number
+Diff line change
 +---
 +fixes:
 +  - |
 +    Updated the `PDFMinerToDocument` convert function to add double new lines between each container_text so that passages can later be detected by `DocumentSplitter`.