Commit 6a373ad: remove collapse document summarizer
Signed-off-by: Henry Lindeman <[email protected]>
HenryL27 committed Feb 27, 2025
1 parent d18a03f commit 6a373ad
Showing 3 changed files with 20 additions and 110 deletions.
51 changes: 13 additions & 38 deletions lib/sycamore/sycamore/query/execution/operations.py
@@ -5,24 +5,31 @@

 from sycamore import DocSet
 from sycamore.context import context_params, Context
-from sycamore.data import MetadataDocument, Document, Element
-from sycamore.functions import CharacterTokenizer, Tokenizer
+from sycamore.data import Document, Element
+from sycamore.functions.tokenizer import OpenAITokenizer
 from sycamore.llms.llms import LLM
 from sycamore.llms.prompts import RenderedPrompt, RenderedMessage
 from sycamore.llms.prompts.default_prompts import (
     SummarizeDataMessagesPrompt,
 )
 from sycamore.transforms.summarize import (
+    EtCetera,
     MultiStepDocumentSummarizer,
+    OneStepDocumentSummarizer,
     Summarizer,
-    CollapseDocumentSummarizer,
-    collapse,
-    QuestionAnsweringSummarizer,
 )

 log = structlog.get_logger(__name__)
+# multistep
 DEFAULT_DOCSET_SUMMARIZER_CLS = MultiStepDocumentSummarizer
-DEFAULT_SUMMARIZER_KWARGS: dict[str, Any] = {}
+DEFAULT_SUMMARIZER_KWARGS: dict[str, Any] = {
+    "fields": "*",
+    "tokenizer": OpenAITokenizer("gpt-4o"),
+    "max_tokens": 80_000,
+}
+# onestep
+DEFAULT_DOCSET_SUMMARIZER_CLS = OneStepDocumentSummarizer
+DEFAULT_SUMMARIZER_KWARGS = {"fields": [EtCetera], "tokenizer": OpenAITokenizer("gpt-4o"), "token_limit": 80_000}


 def math_operation(val1: int, val2: int, operator: str) -> Union[int, float]:
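
Aside: in the new defaults, both configurations are spelled out and the later one-step assignment wins, so OneStepDocumentSummarizer is the active default. A minimal sketch of how a caller might consume these defaults; the kwargs come from the diff above, but the construction site and the llm/question parameters are assumptions, not part of this commit:

    # Sketch only: assumed call site for the defaults above.
    # llm and question would come from the surrounding query context (assumption).
    summarizer = DEFAULT_DOCSET_SUMMARIZER_CLS(
        llm=llm,
        question=question,
        **DEFAULT_SUMMARIZER_KWARGS,
    )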
@@ -139,35 +146,3 @@ def _docset_to_singledoc(ds: DocSet) -> Document:
     explode.
     """
     return Document(elements=[Element(**d.data) for d in ds.take_all()])
-
-
-@context_params
-def summarize_map_reduce(
-    llm: LLM,
-    question: str,
-    result_description: str,
-    result_data: List[Any],
-    use_elements: bool = False,
-    num_elements: int = 5,
-    max_tokens: int = 10 * 1000,
-    tokenizer: Tokenizer = CharacterTokenizer(),
-) -> str:
-    """ """
-    text = f"Data description: {result_description}\n"
-    for i, result in enumerate(result_data):
-        if isinstance(result, DocSet):
-            docs = (
-                result.filter(lambda d: isinstance(d, MetadataDocument) is False)
-                .summarize(
-                    summarizer=CollapseDocumentSummarizer(llm, question)
-                )  # document-level summarization can be parallelized (per DocSet)
-                .take_all()
-            )
-            for doc in docs:
-                text += doc.properties["summary"] + "\n"
-
-        else:
-            text += str(result) + "\n"
-
-    final_summary = collapse(text, max_tokens, tokenizer, QuestionAnsweringSummarizer(llm, question))
-    return final_summary
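
Callers of the removed summarize_map_reduce move to summarize_data; the replacement call shape, exactly as exercised by the updated test further down:

    # Replacement for summarize_map_reduce(llm, question, "summary", [docset]),
    # mirroring the updated test below.
    final_summary = summarize_data(
        llm,
        question,
        result_description="Ray paper",
        result_data=[docset],
    )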
10 changes: 5 additions & 5 deletions (test file; path not captured in this view)
@@ -8,8 +8,8 @@
 from sycamore.query.execution.operations import (
     QuestionAnsweringSummarizer,
     collapse,
-    CollapseDocumentSummarizer,
-    summarize_map_reduce,
+    MultiStepDocumentSummarizer,
+    summarize_data,
 )
 from sycamore.tests.config import TEST_DIR
 from sycamore.transforms.partition import UnstructuredPdfPartitioner
@@ -109,15 +109,15 @@ def test_document_summarizer(self, llm):
         docs = [Document(item) for item in dicts]

         question = "What is"
-        doc_summarizer = CollapseDocumentSummarizer(llm, question)
+        doc_summarizer = MultiStepDocumentSummarizer(llm, question)

         docs[0].text_representation = text[:10000]
         doc = doc_summarizer.summarize(docs[0])
         assert doc.properties["summary"]

     def test_document_summarizer_in_sycamore(self, llm):
         question = "What is"
-        doc_summarizer = CollapseDocumentSummarizer(llm, question)
+        doc_summarizer = MultiStepDocumentSummarizer(llm, question)
         path = str(TEST_DIR / "resources/data/pdfs/Ray.pdf")
         context = sycamore.init(exec_mode=EXEC_RAY)
         result = (
@@ -138,7 +138,7 @@ def test_summarize_map_reduce(self, llm):
         docset = (
             context.read.binary(path, binary_format="pdf").partition(partitioner=UnstructuredPdfPartitioner()).explode()
         )
+        final_summary = summarize_data(llm, question, result_description="Ray paper", result_data=[docset])

-        final_summary = summarize_map_reduce(llm, question, "summary", [docset])
         print(final_summary)
         assert final_summary
69 changes: 2 additions & 67 deletions lib/sycamore/sycamore/transforms/summarize.py
@@ -169,7 +169,8 @@ def collapse(text: str, tokens_per_chunk: int, tokenizer: Tokenizer, summarizer_
 {%- macro get_text(element, itvarname) %}
 {%- if elt.properties[itvarname] == 0 -%}
 {%- if fields is defined -%}
-{%- if fields == "*" %}{% for p in element.properties %}{% if p.startswith('_') %}{% continue %}{% endif %}
+{%- if fields == "*" %}{% for p in element.properties %}
+{%- if p.startswith('_') %}{% continue %}{% endif %}
 {{ p }}: {{ element.properties[p] }}
 {%- endfor -%}
 {%- else %}{% for f in fields %}
@@ -327,72 +328,6 @@ def as_llm_map(self, child: Optional[Node], **kwargs) -> Node:
         return ct


-class CollapseDocumentSummarizer(Summarizer):
-    """
-    Summarizes a document by converting it all to text, then iteratively summarizing chunks
-    of the text + the existing summary to build up a full summary.
-    Args:
-        llm: LLM to use for summarization
-        question: Question to use as context for the summarization. The llm will attempt to
-            answer the question using the data in the document.
-        chunk_size: Size of the chunks to add in each round of summarization
-        tokenizer: Tokenizer to use to compute chunk sizes
-        use_elements: If True, will include data from the elements of the document as well
-            as the document itself. Default is False
-        num_elements: Limit on the number of elements to include if use_elements is true (take
-            the first num_elements elements). Default is 5
-    """
-
-    def __init__(
-        self,
-        llm: LLM,
-        question: str,
-        chunk_size: int = 10 * 1000,
-        tokenizer: Tokenizer = CharacterTokenizer(),
-        chunk_overlap: int = 0,
-        use_elements: bool = False,
-        num_elements: int = 5,
-    ):
-        self.llm = llm
-        self.question = question
-        self.chunk_size = chunk_size
-        self.tokenizer = tokenizer
-        self.chunk_overlap = chunk_overlap
-        self.use_elements = use_elements
-        self.num_elements = num_elements
-
-    def as_llm_map(self, child: Optional[Node], **kwargs):
-        return Map(child, f=self.summarize)  # type: ignore
-
-    def summarize(self, document: Document) -> Document:
-        text = self.get_text(document)
-        summary = collapse(text, self.chunk_size, self.tokenizer, QuestionAnsweringSummarizer(self.llm, self.question))
-        document.properties["summary"] = summary
-        return document
-
-    def get_text(self, doc: Document) -> str:
-        doc_text = ""
-        props_dict = doc.properties.get("entity", {})
-        props_dict.update({p: doc.properties[p] for p in set(doc.properties) - set(BASE_PROPS)})
-        for k, v in props_dict.items():
-            doc_text += f"{k}: {v}\n"
-
-        doc_text_representation = ""
-        if not self.use_elements:
-            if doc.text_representation is not None:
-                doc_text_representation += doc.text_representation[:NUM_TEXT_CHARS_GENERATE]
-        else:
-            for element in doc.elements[: self.num_elements]:
-                # Greedy fill doc level text length
-                if len(doc_text_representation) >= NUM_TEXT_CHARS_GENERATE:
-                    break
-                doc_text_representation += (element.text_representation or "") + "\n"
-        doc_text += f"Text contents:\n{doc_text_representation}\n"
-
-        return doc_text
-
-
 OneStepSummarizerPrompt = JinjaPrompt(
     system="You are a helpful text summarizer",
     user=textwrap.dedent(
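
For downstream code that constructed the removed class directly, the updated tests point to MultiStepDocumentSummarizer as the stand-in; a minimal sketch, assuming the (llm, question) construction used in the test diff and that the summary still lands in properties["summary"] as before:

    from sycamore.transforms.summarize import MultiStepDocumentSummarizer

    # was: CollapseDocumentSummarizer(llm, question)
    doc_summarizer = MultiStepDocumentSummarizer(llm, question)
    doc = doc_summarizer.summarize(document)
    summary = doc.properties["summary"]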
