implemented boilerplate transforms of documents. (#668)

RitxmSaha · web-flow · commit 1dc25f578c7c · 2024-08-12T13:47:00.000-07:00
* implemented boilerplate into pipeline

* fixed tests

* linted

* fixed mypy

* linted and fixed tests

* updated neo4j auth

* addressed Vinayak's comments

* linted

* added tests for new map functions

* changed neo4j auth

* linted
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -127,9 +127,9 @@ jobs:
       neo4j:
         image: neo4j:5.21.0
         env:
-          NEO4J_AUTH: neo4j/koala-stereo-comedy-spray-figure-6974  # DO NOT SET PASSWORD LESS THAN 8 CHARACTERS
           NEO4J_dbms_memory_heap_initial__size: 2G
           NEO4J_dbms_memory_heap_max__size: 2G
+          NEO4J_dbms_security_auth__enabled: false
           NEO4J_apoc_export_file_enabled: true
           NEO4J_apoc_import_file_enabled: true
           NEO4J_apoc_import_file_use__neo4j__config: true
diff --git a/lib/sycamore/sycamore/docset.py b/lib/sycamore/sycamore/docset.py
@@ -549,6 +549,9 @@ def extract_graph_structure(self, extractors: list[GraphExtractor], **kwargs) ->
                     .explode()
                 )
         """
+        from sycamore.transforms.extract_graph import ExtractDocumentStructure
+
+        self.plan = ExtractDocumentStructure(self.plan)
         docset = self
         for extractor in extractors:
             docset = extractor.extract(docset)
diff --git a/lib/sycamore/sycamore/tests/integration/connectors/neo4j/test_docset_to_neo4j.py b/lib/sycamore/sycamore/tests/integration/connectors/neo4j/test_docset_to_neo4j.py
@@ -1,104 +1,15 @@
 import sycamore
 from sycamore.tests.config import TEST_DIR
-from sycamore.data import HierarchicalDocument, Document
 from sycamore.transforms.partition import SycamorePartitioner
 
 
 def test_to_neo4j():
-    # helper function
-    def restructure_doc(doc: Document) -> HierarchicalDocument:
-        doc = HierarchicalDocument(doc.data)
-        return doc
-
-    # helper function
-    def children_to_section(doc: HierarchicalDocument) -> HierarchicalDocument:
-        import uuid
-
-        # if the first element is not a section header, insert generic placeholder
-        if len(doc.children) > 0 and doc.children[0]["type"] != "Section-header":
-            initial_page = HierarchicalDocument(
-                {
-                    "type": "Section-header",
-                    "bbox": (0, 0, 0, 0),
-                    "properties": {"score": 1, "page_number": 1},
-                    "text_representation": "Front Page",
-                    "binary_representation": b"Front Page",
-                }
-            )
-            doc.children.insert(0, initial_page)  # O(n) insert :( we should use deque for everything
-
-        if "relationships" not in doc.data:
-            doc.data["relationships"] = {}
-        if "label" not in doc.data:
-            doc.data["label"] = "DOCUMENT"
-
-        sections = []
-
-        section: HierarchicalDocument = None
-        element: HierarchicalDocument = None
-        for child in doc.children:
-            if "relationships" not in child.data:
-                child.data["relationships"] = {}
-            if (
-                child.type == "Section-header"
-                and "text_representation" in child.data
-                and len(child.data["text_representation"]) > 0
-            ):
-                if section is not None:
-                    next = {
-                        "TYPE": "NEXT",
-                        "properties": {},
-                        "START_ID": section.doc_id,
-                        "START_LABEL": "SECTION",
-                        "END_ID": child.doc_id,
-                        "END_LABEL": "SECTION",
-                    }
-                    child.data["relationships"][str(uuid.uuid4())] = next
-                    element = None
-                rel = {
-                    "TYPE": "SECTION_OF",
-                    "properties": {},
-                    "START_ID": child.doc_id,
-                    "START_LABEL": "SECTION",
-                    "END_ID": doc.doc_id,
-                    "END_LABEL": "DOCUMENT",
-                }
-                child.data["relationships"][str(uuid.uuid4())] = rel
-                child.data["label"] = "SECTION"
-                section = child
-                sections.append(section)
-            else:
-                if element is not None:
-                    next = {
-                        "TYPE": "NEXT",
-                        "properties": {},
-                        "START_ID": element.doc_id,
-                        "START_LABEL": "ELEMENT",
-                        "END_ID": child.doc_id,
-                        "END_LABEL": "ELEMENT",
-                    }
-                    child.data["relationships"][str(uuid.uuid4())] = next
-                rel = {
-                    "TYPE": "PART_OF",
-                    "properties": {},
-                    "START_ID": child.doc_id,
-                    "START_LABEL": "ELEMENT",
-                    "END_ID": section.doc_id,
-                    "END_LABEL": "SECTION",
-                }
-                child.data["relationships"][str(uuid.uuid4())] = rel
-                child.data["label"] = "ELEMENT"
-                element = child
-                section.data["children"].append(element)
-
-        doc.children = sections
-        return doc
 
     ## actual test ##
     path = str(TEST_DIR / "resources/data/pdfs/Ray_page11.pdf")
     context = sycamore.init()
     URI = "neo4j://localhost:7687"
-    AUTH = ("neo4j", "koala-stereo-comedy-spray-figure-6974")
+    AUTH = None
     DATABASE = "neo4j"
 
     ds = (
@@ -107,8 +18,7 @@ def children_to_section(doc: HierarchicalDocument) -> HierarchicalDocument:
             partitioner=SycamorePartitioner(extract_table_structure=True, use_ocr=True, extract_images=True),
             num_gpus=0.2,
         )
-        .map(restructure_doc)
-        .map(children_to_section)
+        .extract_graph_structure(extractors=[])
         .explode()
     )
 
diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_graph_extractor.py b/lib/sycamore/sycamore/tests/unit/transforms/test_graph_extractor.py
@@ -1,11 +1,18 @@
 from typing import Optional
 import sycamore
+from sycamore.data.document import Document
+from sycamore.data.element import Element
 from sycamore.llms.llms import LLM
 from sycamore.reader import DocSetReader
 from sycamore.transforms.extract_graph import GraphMetadata, MetadataExtractor, GraphEntity, EntityExtractor
+from sycamore.transforms.extract_graph import ExtractSummaries, ExtractDocumentStructure
 from sycamore.data import HierarchicalDocument
 from collections import defaultdict
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 class TestGraphExtractor:
     metadata_docs = [
@@ -42,34 +49,38 @@ class TestGraphExtractor:
     ]
 
     entity_docs = [
-        HierarchicalDocument(
+        Document(
             {
                 "doc_id": "1",
-                "label": "Document",
                 "type": "pdf",
-                "relationships": {},
                 "properties": {"company": "3M", "sector": "Industrial", "doctype": "10K"},
-                "children": [
-                    HierarchicalDocument(
+                "elements": [
+                    Element(
                         {
-                            "doc_id": "2",
-                            "label": "Document",
-                            "type": "pdf",
-                            "relationships": {},
-                            "summary": "...",
+                            "type": "Section-header",
+                            "text_representation": "header-1",
                             "properties": {},
-                            "children": [],
                         }
                     ),
-                    HierarchicalDocument(
+                    Element(
                         {
-                            "doc_id": "3",
-                            "label": "Document",
-                            "type": "pdf",
-                            "relationships": {},
-                            "summary": "...",
+                            "type": "text",
+                            "text_representation": "i'm text-1",
+                            "properties": {},
+                        }
+                    ),
+                    Element(
+                        {
+                            "type": "Section-header",
+                            "text_representation": "header-2",
+                            "properties": {},
+                        }
+                    ),
+                    Element(
+                        {
+                            "type": "text",
+                            "text_representation": "i'm text-2",
                             "properties": {},
-                            "children": [],
                         }
                     ),
                 ],
@@ -172,3 +183,37 @@ def test_entity_extractor(self):
         assert len(nested_dict["Company"]["Microsoft"]) == 2
         assert len(nested_dict["Company"]["Google"]) == 2
         assert len(nested_dict["Company"]["3M"]) == 2
+
+    def test_extract_document_structure(self):
+        """Check that ExtractDocumentStructure labels every level of the hierarchy."""
+        context = sycamore.init()
+        reader = DocSetReader(context)
+        ds = reader.document(self.entity_docs)
+
+        # Wrap the existing plan directly instead of going through a DocSet helper.
+        ds.plan = ExtractDocumentStructure(ds.plan)
+        docs = ds.take_all()
+
+        # Expected shape: DOCUMENT -> SECTION -> ELEMENT.
+        # NOTE(review): these loops pass vacuously when a document has no children or a
+        # section has no elements; consider also asserting the expected counts.
+        for document in docs:
+            assert document.data["label"] == "DOCUMENT"
+            for section in document.children:
+                assert section.data["label"] == "SECTION"
+                for element in section.children:
+                    assert element.data["label"] == "ELEMENT"
+
+    def test_summarize_sections(self):
+        """Check the per-section summary text produced by ExtractSummaries."""
+        context = sycamore.init()
+        reader = DocSetReader(context)
+        ds = reader.document(self.entity_docs)
+
+        # ExtractSummaries iterates sections, so the structure transform is applied first.
+        ds.plan = ExtractDocumentStructure(ds.plan)
+        ds.plan = ExtractSummaries(ds.plan)
+        docs = ds.take_all()
+
+        # Expected summaries, indexed by section position within a document.
+        summaries = [
+            "-----SECTION TITLE: header-1-----\n---Element Type: text---\ni'm text-1\n",
+            "-----SECTION TITLE: header-2-----\n---Element Type: text---\ni'm text-2\n",
+        ]
+
+        # The same expected list is applied to every document returned.
+        for document in docs:
+            for index, section in enumerate(document.children):
+                # Emit the actual summary so a mismatch is easy to read in the test log.
+                logger.warning(section.data["summary"])
+                assert section.data["summary"] == summaries[index]
diff --git a/lib/sycamore/sycamore/transforms/extract_graph.py b/lib/sycamore/sycamore/transforms/extract_graph.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Awaitable, Dict, Any
+from typing import TYPE_CHECKING, Awaitable, Dict, Any, Optional
 from sycamore.plan_nodes import Node
 from sycamore.transforms.map import Map
 from sycamore.data import Document, MetadataDocument, HierarchicalDocument
@@ -203,6 +203,7 @@ def extract(self, docset: "DocSet") -> "DocSet":
         """
         Extracts entities from documents then creates a document in the docset where they are stored as nodes
         """
+        docset.plan = ExtractSummaries(docset.plan)
         docset.plan = ExtractFeatures(docset.plan, self)
         docset = self.resolve(docset)
         return docset
@@ -300,6 +301,121 @@ def GraphEntityExtractorPrompt(entities, query):
     Output:"""
 
 
+class ExtractDocumentStructure(Map):
+    """
+    Extracts the structure of the document organizing document elements by their
+    respective section headers.
+
+    Each input document is rewrapped as a ``HierarchicalDocument`` labelled ``DOCUMENT``.
+    Children that are non-empty ``Section-header`` nodes become ``SECTION`` nodes; every
+    other child is attached to the most recent section as an ``ELEMENT``. Relationship
+    records (``SECTION_OF``, ``PART_OF`` and ``NEXT``) are stored in each node's
+    ``relationships`` dict under a random UUID string key.
+    """
+
+    def __init__(self, child: Node, **resource_args):
+        super().__init__(child, f=ExtractDocumentStructure.structure_by_section, **resource_args)
+
+    @staticmethod
+    def _is_section_header(node: HierarchicalDocument) -> bool:
+        """Return True when ``node`` is a ``Section-header`` with non-empty text."""
+        return node.type == "Section-header" and bool(node.data.get("text_representation"))
+
+    @staticmethod
+    def _relationship(rel_type: str, start_id, start_label: str, end_id, end_label: str) -> Dict[str, Any]:
+        """Build one relationship record pointing from the start node to the end node."""
+        return {
+            "TYPE": rel_type,
+            "properties": {},
+            "START_ID": start_id,
+            "START_LABEL": start_label,
+            "END_ID": end_id,
+            "END_LABEL": end_label,
+        }
+
+    @staticmethod
+    def structure_by_section(doc: Document) -> HierarchicalDocument:
+        """
+        Regroup the children of ``doc`` under their section headers.
+
+        Args:
+            doc: The document to restructure; its ``data`` is rewrapped as a
+                ``HierarchicalDocument``.
+
+        Returns:
+            The hierarchical document whose ``children`` are the sections found, each
+            holding the elements that followed its header.
+        """
+        import uuid
+
+        is_header = ExtractDocumentStructure._is_section_header
+        relationship = ExtractDocumentStructure._relationship
+
+        doc = HierarchicalDocument(doc.data)
+        # If the first child does not open a section, insert a generic placeholder. This
+        # includes a leading Section-header with empty text, which the loop below treats
+        # as a plain element and which would otherwise have no section to attach to.
+        if len(doc.children) > 0 and not is_header(doc.children[0]):
+            initial_page = HierarchicalDocument(
+                {
+                    "type": "Section-header",
+                    "bbox": (0, 0, 0, 0),
+                    "properties": {"score": 1, "page_number": 1},
+                    "text_representation": "Front Page",
+                    "binary_representation": b"Front Page",
+                }
+            )
+            doc.children.insert(0, initial_page)  # O(n) insert; done at most once per document
+
+        doc.data["relationships"] = doc.get("relationships", {})
+        doc.data["label"] = doc.get("label", "DOCUMENT")
+
+        sections = []
+
+        section: Optional[HierarchicalDocument] = None
+        element: Optional[HierarchicalDocument] = None
+        for child in doc.children:
+            child.data["relationships"] = child.get("relationships", {})
+            relationships = child.data["relationships"]
+            if is_header(child):
+                if section is not None:
+                    # Chain consecutive sections together.
+                    relationships[str(uuid.uuid4())] = relationship(
+                        "NEXT", section.doc_id, "SECTION", child.doc_id, "SECTION"
+                    )
+                    # A new section starts a fresh ELEMENT chain.
+                    element = None
+                relationships[str(uuid.uuid4())] = relationship(
+                    "SECTION_OF", child.doc_id, "SECTION", doc.doc_id, "DOCUMENT"
+                )
+                child.data["label"] = "SECTION"
+                section = child
+                sections.append(section)
+            else:
+                # Invariant (also narrows the type for mypy): the first child always opens
+                # a section because of the placeholder logic above.
+                assert section is not None
+                if element is not None:
+                    # Chain consecutive elements of the same section together.
+                    relationships[str(uuid.uuid4())] = relationship(
+                        "NEXT", element.doc_id, "ELEMENT", child.doc_id, "ELEMENT"
+                    )
+                relationships[str(uuid.uuid4())] = relationship(
+                    "PART_OF", child.doc_id, "ELEMENT", section.doc_id, "SECTION"
+                )
+                child.data["label"] = "ELEMENT"
+                element = child
+                section.data["children"].append(element)
+
+        doc.children = sections
+        return doc
+
+
+class ExtractSummaries(Map):
+    """
+    Extracts summaries from child documents to be used for entity extraction. This function
+    generates summaries for sections within documents which are used during entity extraction.
+
+    The summary is a plain-text concatenation of the section title and the type and text
+    of each element in the section; it is stored under the section's ``summary`` key.
+    """
+
+    def __init__(self, child: Node, **resource_args):
+        super().__init__(child, f=ExtractSummaries.summarize_sections, **resource_args)
+
+    @staticmethod
+    def summarize_sections(doc: HierarchicalDocument) -> HierarchicalDocument:
+        """
+        Attach a text summary to every section of ``doc``.
+
+        Args:
+            doc: A document whose children are sections and whose grandchildren are
+                elements. Documents carrying an ``EXTRACTED_NODES`` key are returned
+                untouched.
+
+        Returns:
+            The same document, with ``summary`` set on each section.
+
+        Raises:
+            ValueError: If a section has no text representation, or an element has no
+                type or no text representation.
+        """
+        if "EXTRACTED_NODES" in doc.data:
+            return doc
+        for section in doc.children:
+            # Explicit checks instead of ``assert`` so validation survives ``python -O``.
+            if section.text_representation is None:
+                raise ValueError(f"Section {section.doc_id} has no text representation to summarize")
+            parts = [f"-----SECTION TITLE: {section.text_representation.strip()}-----\n"]
+            for element in section.children:
+                if element.type == "table":
+                    # Tables are summarized through their CSV rendering, which also
+                    # replaces the element's text representation.
+                    element.text_representation = element.data["table"].to_csv()
+                if element.type is None:
+                    raise ValueError(f"Element {element.doc_id} has no type")
+                if element.text_representation is None:
+                    raise ValueError(f"Element {element.doc_id} has no text representation")
+                parts.append(f"---Element Type: {element.type.strip()}---\n{element.text_representation.strip()}\n")
+            section.data["summary"] = "".join(parts)
+        return doc
+
+
 class ExtractFeatures(Map):
     """
     Extracts features determined by a specific extractor from each document