Commit 6a373ad: remove collapse document summarizer
Signed-off-by: Henry Lindeman <[email protected]>
HenryL27 committed Feb 27, 2025
1 parent d18a03f commit 6a373ad
Showing 3 changed files with 20 additions and 110 deletions.
51 changes: 13 additions & 38 deletions lib/sycamore/sycamore/query/execution/operations.py
@@ -5,24 +5,31 @@

 from sycamore import DocSet
 from sycamore.context import context_params, Context
-from sycamore.data import MetadataDocument, Document, Element
-from sycamore.functions import CharacterTokenizer, Tokenizer
+from sycamore.data import Document, Element
+from sycamore.functions.tokenizer import OpenAITokenizer
 from sycamore.llms.llms import LLM
 from sycamore.llms.prompts import RenderedPrompt, RenderedMessage
 from sycamore.llms.prompts.default_prompts import (
     SummarizeDataMessagesPrompt,
 )
 from sycamore.transforms.summarize import (
+    EtCetera,
     MultiStepDocumentSummarizer,
+    OneStepDocumentSummarizer,
     Summarizer,
-    CollapseDocumentSummarizer,
-    collapse,
-    QuestionAnsweringSummarizer,
 )

 log = structlog.get_logger(__name__)
+# multistep
 DEFAULT_DOCSET_SUMMARIZER_CLS = MultiStepDocumentSummarizer
-DEFAULT_SUMMARIZER_KWARGS: dict[str, Any] = {}
+DEFAULT_SUMMARIZER_KWARGS: dict[str, Any] = {
+    "fields": "*",
+    "tokenizer": OpenAITokenizer("gpt-4o"),
+    "max_tokens": 80_000,
+}
+# onestep
+DEFAULT_DOCSET_SUMMARIZER_CLS = OneStepDocumentSummarizer
+DEFAULT_SUMMARIZER_KWARGS = {"fields": [EtCetera], "tokenizer": OpenAITokenizer("gpt-4o"), "token_limit": 80_000}


 def math_operation(val1: int, val2: int, operator: str) -> Union[int, float]:
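
Aside: in the new defaults, both configurations are spelled out and the later one-step assignment wins, so OneStepDocumentSummarizer is the active default. A minimal sketch of how a caller might consume these defaults; the kwargs come from the diff above, but the construction site and the llm/question parameters are assumptions, not part of this commit:

    # Sketch only: assumed call site for the defaults above.
    # llm and question would come from the surrounding query context (assumption).
    summarizer = DEFAULT_DOCSET_SUMMARIZER_CLS(
        llm=llm,
        question=question,
        **DEFAULT_SUMMARIZER_KWARGS,
    )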
@@ -139,35 +146,3 @@ def _docset_to_singledoc(ds: DocSet) -> Document:
     explode.
     """
     return Document(elements=[Element(**d.data) for d in ds.take_all()])
-
-
-@context_params
-def summarize_map_reduce(
-    llm: LLM,
-    question: str,
-    result_description: str,
-    result_data: List[Any],
-    use_elements: bool = False,
-    num_elements: int = 5,
-    max_tokens: int = 10 * 1000,
-    tokenizer: Tokenizer = CharacterTokenizer(),
-) -> str:
-    """ """
-    text = f"Data description: {result_description}\n"
-    for i, result in enumerate(result_data):
-        if isinstance(result, DocSet):
-            docs = (
-                result.filter(lambda d: isinstance(d, MetadataDocument) is False)
-                .summarize(
-                    summarizer=CollapseDocumentSummarizer(llm, question)
-                )  # document-level summarization can be parallelized (per DocSet)
-                .take_all()
-            )
-            for doc in docs:
-                text += doc.properties["summary"] + "\n"
-
-        else:
-            text += str(result) + "\n"
-
-    final_summary = collapse(text, max_tokens, tokenizer, QuestionAnsweringSummarizer(llm, question))
-    return final_summary
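
Callers of the removed summarize_map_reduce move to summarize_data; the replacement call shape, exactly as exercised by the updated test further down:

    # Replacement for summarize_map_reduce(llm, question, "summary", [docset]),
    # mirroring the updated test below.
    final_summary = summarize_data(
        llm,
        question,
        result_description="Ray paper",
        result_data=[docset],
    )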
10 changes: 5 additions & 5 deletions (test file; path not captured in this view)
@@ -8,8 +8,8 @@
 from sycamore.query.execution.operations import (
     QuestionAnsweringSummarizer,
     collapse,
-    CollapseDocumentSummarizer,
-    summarize_map_reduce,
+    MultiStepDocumentSummarizer,
+    summarize_data,
 )
 from sycamore.tests.config import TEST_DIR
 from sycamore.transforms.partition import UnstructuredPdfPartitioner
@@ -109,15 +109,15 @@ def test_document_summarizer(self, llm):
         docs = [Document(item) for item in dicts]

         question = "What is"
-        doc_summarizer = CollapseDocumentSummarizer(llm, question)
+        doc_summarizer = MultiStepDocumentSummarizer(llm, question)

         docs[0].text_representation = text[:10000]
         doc = doc_summarizer.summarize(docs[0])
         assert doc.properties["summary"]

     def test_document_summarizer_in_sycamore(self, llm):
         question = "What is"
-        doc_summarizer = CollapseDocumentSummarizer(llm, question)
+        doc_summarizer = MultiStepDocumentSummarizer(llm, question)
         path = str(TEST_DIR / "resources/data/pdfs/Ray.pdf")
         context = sycamore.init(exec_mode=EXEC_RAY)
         result = (
@@ -138,7 +138,7 @@ def test_summarize_map_reduce(self, llm):
         docset = (
             context.read.binary(path, binary_format="pdf").partition(partitioner=UnstructuredPdfPartitioner()).explode()
         )
+        final_summary = summarize_data(llm, question, result_description="Ray paper", result_data=[docset])

-        final_summary = summarize_map_reduce(llm, question, "summary", [docset])
         print(final_summary)
         assert final_summary
69 changes: 2 additions & 67 deletions lib/sycamore/sycamore/transforms/summarize.py
@@ -169,7 +169,8 @@ def collapse(text: str, tokens_per_chunk: int, tokenizer: Tokenizer, summarizer_
 {%- macro get_text(element, itvarname) %}
 {%- if elt.properties[itvarname] == 0 -%}
 {%- if fields is defined -%}
-{%- if fields == "*" %}{% for p in element.properties %}{% if p.startswith('_') %}{% continue %}{% endif %}
+{%- if fields == "*" %}{% for p in element.properties %}
+{%- if p.startswith('_') %}{% continue %}{% endif %}
 {{ p }}: {{ element.properties[p] }}
 {%- endfor -%}
 {%- else %}{% for f in fields %}
@@ -327,72 +328,6 @@ def as_llm_map(self, child: Optional[Node], **kwargs) -> Node:
         return ct


-class CollapseDocumentSummarizer(Summarizer):
-    """
-    Summarizes a document by converting it all to text, then iteratively summarizing chunks
-    of the text + the existing summary to build up a full summary.
-    Args:
-        llm: LLM to use for summarization
-        question: Question to use as context for the summarization. The llm will attempt to
-            answer the question using the data in the document.
-        chunk_size: Size of the chunks to add in each round of summarization
-        tokenizer: Tokenizer to use to compute chunk sizes
-        use_elements: If True, will include data from the elements of the document as well
-            as the document itself. Default is False
-        num_elements: Limit on the number of elements to include if use_elements is true (take
-            the first num_elements elements). Default is 5
-    """
-
-    def __init__(
-        self,
-        llm: LLM,
-        question: str,
-        chunk_size: int = 10 * 1000,
-        tokenizer: Tokenizer = CharacterTokenizer(),
-        chunk_overlap: int = 0,
-        use_elements: bool = False,
-        num_elements: int = 5,
-    ):
-        self.llm = llm
-        self.question = question
-        self.chunk_size = chunk_size
-        self.tokenizer = tokenizer
-        self.chunk_overlap = chunk_overlap
-        self.use_elements = use_elements
-        self.num_elements = num_elements
-
-    def as_llm_map(self, child: Optional[Node], **kwargs):
-        return Map(child, f=self.summarize)  # type: ignore
-
-    def summarize(self, document: Document) -> Document:
-        text = self.get_text(document)
-        summary = collapse(text, self.chunk_size, self.tokenizer, QuestionAnsweringSummarizer(self.llm, self.question))
-        document.properties["summary"] = summary
-        return document
-
-    def get_text(self, doc: Document) -> str:
-        doc_text = ""
-        props_dict = doc.properties.get("entity", {})
-        props_dict.update({p: doc.properties[p] for p in set(doc.properties) - set(BASE_PROPS)})
-        for k, v in props_dict.items():
-            doc_text += f"{k}: {v}\n"
-
-        doc_text_representation = ""
-        if not self.use_elements:
-            if doc.text_representation is not None:
-                doc_text_representation += doc.text_representation[:NUM_TEXT_CHARS_GENERATE]
-        else:
-            for element in doc.elements[: self.num_elements]:
-                # Greedy fill doc level text length
-                if len(doc_text_representation) >= NUM_TEXT_CHARS_GENERATE:
-                    break
-                doc_text_representation += (element.text_representation or "") + "\n"
-        doc_text += f"Text contents:\n{doc_text_representation}\n"
-
-        return doc_text
-
-
 OneStepSummarizerPrompt = JinjaPrompt(
     system="You are a helpful text summarizer",
     user=textwrap.dedent(
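
For downstream code that constructed the removed class directly, the updated tests point to MultiStepDocumentSummarizer as the stand-in; a minimal sketch, assuming the (llm, question) construction used in the test diff and that the summary still lands in properties["summary"] as before:

    from sycamore.transforms.summarize import MultiStepDocumentSummarizer

    # was: CollapseDocumentSummarizer(llm, question)
    doc_summarizer = MultiStepDocumentSummarizer(llm, question)
    doc = doc_summarizer.summarize(document)
    summary = doc.properties["summary"]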
