Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[llm unify 7/n] Summarize #1192

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ff9ca26
initial hierarchical document summarize implementation
HenryL27 Feb 20, 2025
53879e6
ruff
HenryL27 Feb 20, 2025
9dffee1
make some tests work
HenryL27 Feb 20, 2025
532f7e8
fix more tests
HenryL27 Feb 21, 2025
8d2b2f8
mypy
HenryL27 Feb 21, 2025
aa1e51f
fix llm filter codegen
HenryL27 Feb 21, 2025
93fb7ec
put back collapsing summarizer
HenryL27 Feb 24, 2025
41e7a19
fix names
HenryL27 Feb 24, 2025
d77df22
add docset summarizer parametrization
HenryL27 Feb 24, 2025
8f10485
add roundrobin summarizer
HenryL27 Feb 25, 2025
8066362
mypy and ruff
HenryL27 Feb 25, 2025
6eb82c2
rename to RoundRobinOneshotDocumentSummarizer
HenryL27 Feb 25, 2025
13d6041
factor complicated common jinja logic to fragments
HenryL27 Feb 25, 2025
90e6e4e
add max tokens hierarchical summarizer
HenryL27 Feb 25, 2025
a621b8d
ruff
HenryL27 Feb 25, 2025
b690923
fix unit tests
HenryL27 Feb 25, 2025
b1acb1e
mypy
HenryL27 Feb 25, 2025
eb18e7e
add unit tests for summarizers
HenryL27 Feb 26, 2025
7fce340
a whole bunch of docstrings
HenryL27 Feb 26, 2025
2c887b1
oops didn't mean to commit this
HenryL27 Feb 26, 2025
e22111a
move complex prompts to be next to the complex code that sets them up
HenryL27 Feb 26, 2025
3fe1c14
have summarize_data take a summarizer instance rather than a summari…
HenryL27 Feb 26, 2025
687d1e4
fix unit tests
HenryL27 Feb 26, 2025
d18a03f
inline get text macro since it's only used by one template
HenryL27 Feb 26, 2025
6a373ad
remove collapse document summarizer
HenryL27 Feb 27, 2025
5b267a7
apparently that lets me get rid of collapse and qasummarizer too, nice
HenryL27 Feb 27, 2025
d156b03
mypy + tests
HenryL27 Feb 27, 2025
9104aff
ruff
HenryL27 Feb 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions lib/sycamore/sycamore/docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,10 +733,8 @@ def summarize(self, summarizer: Summarizer, **kwargs) -> "DocSet":
.partition(partitioner=ArynPartitioner())
.summarize(summarizer=summarizer)
"""
from sycamore.transforms import Summarize

summaries = Summarize(self.plan, summarizer=summarizer, **kwargs)
return DocSet(self.context, summaries)
map = summarizer.as_llm_map(self.plan, **kwargs)
return DocSet(self.context, map)

def mark_bbox_preset(self, tokenizer: Tokenizer, token_limit: int = 512, **kwargs) -> "DocSet":
"""
Expand Down
9 changes: 9 additions & 0 deletions lib/sycamore/sycamore/llms/prompts/default_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,15 @@ class _TextSummarizerGuidancePrompt(SimplePrompt):
""",
)

# Prompt for summarizing one element's text. The system message sets the summarizer
# role; the user message instructs the model to summarize using only the supplied
# information and then embeds `elt.text_representation` via the Jinja placeholder.
# NOTE(review): presumably `elt` is bound to the element being rendered by
# JinjaElementPrompt — confirm against that class.
TextSummarizerJinjaPrompt = JinjaElementPrompt(
system="You are a helpful text summarizer.",
user="""Write a summary of the following. Use only the information provided.
Include as many key details as possible. Do not make up your answer. Only return the summary as part of your answer.

{{ elt.text_representation }}
""",
)


class _SchemaZeroShotGuidancePrompt(SimplePrompt):
system = "You are a helpful entity extractor. You only return JSON Schema."
Expand Down
196 changes: 77 additions & 119 deletions lib/sycamore/sycamore/query/execution/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,32 @@

from sycamore import DocSet
from sycamore.context import context_params, Context
from sycamore.data import MetadataDocument
from sycamore.functions import CharacterTokenizer, Tokenizer
from sycamore.data import Document, Element
from sycamore.functions.tokenizer import OpenAITokenizer
from sycamore.llms.llms import LLM
from sycamore.llms.prompts import RenderedPrompt, RenderedMessage
from sycamore.llms.prompts.default_prompts import (
SummarizeDataMessagesPrompt,
)
from sycamore.transforms.summarize import (
NUM_TEXT_CHARS_GENERATE,
DocumentSummarizer,
collapse,
QuestionAnsweringSummarizer,
BASE_PROPS,
EtCetera,
MultiStepDocumentSummarizer,
OneStepDocumentSummarizer,
Summarizer,
)

log = structlog.get_logger(__name__)
# multistep
# NOTE(review): this first pair of defaults is rebound by the "onestep" pair below, so
# after import only the onestep values remain. It looks like the two pairs are meant to
# be toggled by hand — confirm that keeping both assignments is intentional.
DEFAULT_DOCSET_SUMMARIZER_CLS = MultiStepDocumentSummarizer # type: ignore

DEFAULT_SUMMARIZER_KWARGS: dict[str, Any] = {
"fields": "*",
"tokenizer": OpenAITokenizer("gpt-4o"),
"max_tokens": 80_000,
}
# onestep
# These assignments run last, so they are the values `summarize_data` uses to build a
# summarizer when the caller does not pass `docset_summarizer`.
DEFAULT_DOCSET_SUMMARIZER_CLS = OneStepDocumentSummarizer # type: ignore
DEFAULT_SUMMARIZER_KWARGS = {"fields": [EtCetera], "tokenizer": OpenAITokenizer("gpt-4o"), "token_limit": 80_000}


def math_operation(val1: int, val2: int, operator: str) -> Union[int, float]:
Expand Down Expand Up @@ -52,14 +63,12 @@ def math_operation(val1: int, val2: int, operator: str) -> Union[int, float]:
@context_params
def summarize_data(
llm: LLM,
question: str,
question: Optional[str],
result_description: str,
result_data: List[Any],
use_elements: bool = False,
num_elements: int = 5,
max_tokens: int = 120 * 1000,
tokenizer: Tokenizer = CharacterTokenizer(),
summaries_as_text: bool = False,
context: Optional[Context] = None,
docset_summarizer: Optional[Summarizer] = None,
**kwargs,
) -> str:
"""
Expand All @@ -71,123 +80,72 @@ def summarize_data(
question: Question to answer.
result_description: Description of each of the inputs in result_data.
result_data: List of inputs.
use_elements: Use text contents from document.elements instead of document.text_representation.
num_elements: Number of elements whose text to use from each document.
max_tokens: Maximum number of tokens allowed in the summary to send to the LLM.
tokenizer: Tokenizer to use for counting against max_tokens.
summaries_as_text: If true, summarize all documents in the result_data docsets and treat
those summaries as the text representation for the final summarize step.
context: Optional Context object to get default parameters from.
docset_summarizer: Summarizer instance to use to summarize the docset. If None, a
`DEFAULT_DOCSET_SUMMARIZER_CLS` instance is constructed from `llm`, `question`, and
`DEFAULT_SUMMARIZER_KWARGS`. To control constructor arguments such as `tokenizer`,
`token_limit`, or `element_batch_size`, build the summarizer yourself and pass it in.
**kwargs: Additional keyword arguments.

Returns:
Conversational response to question.
"""
text = _get_text_for_summarize_data(
result_description=result_description,
result_data=result_data,
use_elements=use_elements,
num_elements=num_elements,
max_tokens=max_tokens,
tokenizer=tokenizer,
**kwargs,
)
messages = SummarizeDataMessagesPrompt(question=question, text=text).as_messages()
prompt_kwargs = {"messages": messages}

# call to LLM
completion = llm.generate_old(prompt_kwargs=prompt_kwargs, llm_kwargs={"temperature": 0})

# LLM response
if docset_summarizer is None:
docset_summarizer = DEFAULT_DOCSET_SUMMARIZER_CLS(
llm=llm, question=question, **DEFAULT_SUMMARIZER_KWARGS # type: ignore
)

if all(isinstance(d, DocSet) for d in result_data):
return summarize_data_docsets(
llm,
question,
result_data,
docset_summarizer=docset_summarizer,
data_description=result_description,
summaries_as_text=summaries_as_text,
)

# If data is not DocSets, text is this list here
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we force data to always be docsets? If it somehow isn't convert it to a DocSet?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to vinayak if it's not docsets it's a single scalar (output of Count or Math operator). You could, I guess, wrap it in a Document and wrap that in a DocSet. Seems like hunting ducks with a bazooka. Also the data will look very different so you probably can't use the same prompting anyway

# TODO: Jinjify.
text = f"Data description: {result_description}\n"
for i, d in enumerate(result_data):
text += f"Input {i + 1}: {str(d)}\n"

messages = SummarizeDataMessagesPrompt(question=question or "", text=text).as_messages()
prompt = RenderedPrompt(messages=[RenderedMessage(role=m["role"], content=m["content"]) for m in messages])
completion = llm.generate(prompt=prompt)
return completion


def _get_text_for_summarize_data(
    result_description: str,
    result_data: List[Any],
    use_elements: bool,
    num_elements: int,
    max_tokens: Optional[int] = None,
    tokenizer: Optional[Tokenizer] = None,
    **kwargs,
) -> str:
    """
    Build the text that is sent to the LLM when summarizing query results.

    The text starts with the data description, followed by one "Input N" section
    per entry in ``result_data``. DocSet inputs are expanded document by document
    (properties plus text contents); any other input is rendered with ``str()``.

    Args:
        result_description: Description of each of the inputs in result_data.
        result_data: List of inputs; DocSets or arbitrary values.
        use_elements: If True, take text from ``document.elements`` instead of
            ``document.text_representation``.
        num_elements: Maximum number of elements whose text is used per document
            when ``use_elements`` is True.
        max_tokens: Token budget for the whole text. Must be given together
            with ``tokenizer``.
        tokenizer: Tokenizer used to count tokens against ``max_tokens``.
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        The assembled text.

    Raises:
        ValueError: If exactly one of ``max_tokens`` and ``tokenizer`` is provided.
    """
    # Validate before doing any work: a budget needs both a limit and a counter.
    if (max_tokens is None) != (tokenizer is None):
        raise ValueError("Both max_tokens and tokenizer must be provided together.")

    base_props = set(BASE_PROPS)
    text = f"Data description: {result_description}\n"

    for i, result in enumerate(result_data):
        text += f"Input {i + 1}:\n"

        if not isinstance(result, DocSet):
            text += str(result) + "\n"
            continue

        done = False
        docs_included = 0
        # For query result caching in the executor, we need to consume the documents
        # so that the materialized data is complete, even if they are not all included
        # in the input prompt to the LLM. Hence `continue` rather than `break` below.
        for di, doc in enumerate(result.take_all()):
            if isinstance(doc, MetadataDocument) or done:
                continue

            # Consolidate the relevant properties to give to the LLM. Work on a copy so
            # the document's own "entity" dict is not polluted with its other
            # properties, and walk the properties in their stored order so the
            # rendered text is deterministic.
            props_dict = dict(doc.properties.get("entity") or {})
            props_dict.update({p: doc.properties[p] for p in doc.properties if p not in base_props})

            doc_text = f"Document {di}:\n"
            doc_text += "".join(f"{k}: {v}\n" for k, v in props_dict.items())

            doc_text_representation = ""
            if not use_elements:
                if doc.text_representation is not None:
                    doc_text_representation += doc.text_representation[:NUM_TEXT_CHARS_GENERATE]
            else:
                for element in doc.elements[:num_elements]:
                    # Greedy fill doc level text length
                    if len(doc_text_representation) >= NUM_TEXT_CHARS_GENERATE:
                        break
                    doc_text_representation += (element.text_representation or "") + "\n"
            doc_text += f"Text contents:\n{doc_text_representation}\n"

            if tokenizer is not None and max_tokens is not None:  # for mypy
                total_token_count = len(tokenizer.tokenize(text + doc_text))
                if total_token_count > max_tokens:
                    # Report how many documents of this DocSet actually made it in;
                    # the current one is dropped.
                    log.warning(
                        "Unable to add all text to the LLM summary request due to token limit."
                        f" Sending text from {docs_included} docs."
                    )
                    done = True
                    continue

            text += doc_text + "\n"
            docs_included += 1

    return text
def sum_to_text(d: Document) -> Document:
    """
    Promote a document's summary to its text representation.

    If ``d.properties`` has a "summary" entry, that entry is removed from the
    properties and assigned to ``d.text_representation``. A document without a
    summary is returned unchanged. The document is modified in place.

    Args:
        d: Document to update.

    Returns:
        The same document object that was passed in.
    """
    if "summary" not in d.properties:
        return d
    d.text_representation = d.properties.pop("summary")
    return d


@context_params
def summarize_map_reduce(
def summarize_data_docsets(
llm: LLM,
question: str,
result_description: str,
result_data: List[Any],
use_elements: bool = False,
num_elements: int = 5,
max_tokens: int = 10 * 1000,
tokenizer: Tokenizer = CharacterTokenizer(),
question: Optional[str],
result_data: List[DocSet],
docset_summarizer: Summarizer,
data_description: Optional[str] = None,
summaries_as_text: bool = False,
) -> str:
""" """
text = f"Data description: {result_description}\n"
for i, result in enumerate(result_data):
if isinstance(result, DocSet):
docs = (
result.filter(lambda d: isinstance(d, MetadataDocument) is False)
.summarize(
summarizer=DocumentSummarizer(llm, question)
) # document-level summarization can be parallelized (per DocSet)
.take_all()
)
for doc in docs:
text += doc.properties["summary"] + "\n"

else:
text += str(result) + "\n"

final_summary = collapse(text, max_tokens, tokenizer, QuestionAnsweringSummarizer(llm, question))
return final_summary
if summaries_as_text:
result_data = [ds.summarize(docset_summarizer).map(sum_to_text) for ds in result_data]

single_docs = [_docset_to_singledoc(ds) for ds in result_data]
agged_ds = result_data[0].context.read.document(single_docs).summarize(docset_summarizer)
texts = [d.properties["summary"] for d in agged_ds.take_all()]
return "\n".join(texts)


def _docset_to_singledoc(ds: DocSet) -> Document:
    """
    Collapse a docset into one parent document.

    Every Document returned by ``ds.take_all()`` is turned into an Element built
    from the document's ``data``, and all of those elements are attached to a
    single new Document. Essentially a reverse explode.

    Args:
        ds: DocSet whose documents become the elements.

    Returns:
        A new Document whose elements correspond to the documents of ``ds``.
    """
    elements = []
    for doc in ds.take_all():
        elements.append(Element(**doc.data))
    return Document(elements=elements)
5 changes: 2 additions & 3 deletions lib/sycamore/sycamore/query/execution/sycamore_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ def execute(self) -> Any:
result_description=description,
result_data=self.inputs,
context=self.context,
use_elements=True,
**self.get_execute_args(),
)
return result
Expand Down Expand Up @@ -283,7 +282,7 @@ def script(self, input_var: Optional[str] = None, output_var: Optional[str] = No
input_str = input_var or get_var_name(self.logical_node.input_nodes()[0])
output_str = output_var or get_var_name(self.logical_node)
result = f"""
prompt = LlmFilterMessagesPrompt(filter_question='{self.logical_node.question}').as_messages()
prompt = LlmFilterMessagesJinjaPrompt.set(filter_question='{self.logical_node.question}')
{output_str} = {input_str}.llm_filter(
new_field='_autogen_LLMFilterOutput',
prompt=prompt,
Expand All @@ -293,7 +292,7 @@ def script(self, input_var: Optional[str] = None, output_var: Optional[str] = No
)
"""
return result, [
"from sycamore.llms.prompts.default_prompts import LlmFilterMessagesPrompt",
"from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt",
]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@
import sycamore
from sycamore import EXEC_RAY
from sycamore.data import Document
from sycamore.functions import CharacterTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.query.execution.operations import (
QuestionAnsweringSummarizer,
collapse,
DocumentSummarizer,
summarize_map_reduce,
MultiStepDocumentSummarizer,
summarize_data,
)
from sycamore.tests.config import TEST_DIR
from sycamore.transforms.partition import UnstructuredPdfPartitioner
Expand All @@ -24,39 +21,6 @@ def llm():

class TestOperations:

def test_collapse(self, llm):
    """Check that ``collapse`` returns a summary for the Ray paper text."""
    question = "What is"
    summarizer_fn = QuestionAnsweringSummarizer(llm, question)

    # Use this code to generate the text file.
    #
    # path = str(TEST_DIR / "resources/data/pdfs/Ray.pdf")
    # context = sycamore.init(exec_mode=EXEC_RAY)
    # result = (
    #     context.read.binary(path, binary_format="pdf")
    #     .partition(partitioner=UnstructuredPdfPartitioner())
    #     .explode()
    #     #.summarize(summarizer=LLMElementTextSummarizer(llm))
    #     .take_all()
    # )
    # text = ""
    # for doc in result:
    #     #for element in doc.elements:
    #     if doc.text_representation:
    #         text += doc.text_representation + "\n"
    #     # text += "\n"

    text_path = str(TEST_DIR / "resources/data/texts/Ray.txt")
    # Read through a context manager so the file handle is closed deterministically.
    with open(text_path, "r") as f:
        text = f.read()

    max_tokens = 10000
    tokenizer = CharacterTokenizer()
    summary = collapse(text, max_tokens, tokenizer, summarizer_fn)
    assert summary is not None
    print(f"{len(summary)}\n\n{summary}")

def test_document_summarizer(self, llm):
text_path = str(TEST_DIR / "resources/data/texts/Ray.txt")
text = open(text_path, "r").read()
Expand Down Expand Up @@ -109,15 +73,15 @@ def test_document_summarizer(self, llm):
docs = [Document(item) for item in dicts]

question = "What is"
doc_summarizer = DocumentSummarizer(llm, question)
doc_summarizer = MultiStepDocumentSummarizer(llm, question)

docs[0].text_representation = text[:10000]
doc = doc_summarizer.summarize(docs[0])
assert doc.properties["summary"]

def test_document_summarizer_in_sycamore(self, llm):
question = "What is"
doc_summarizer = DocumentSummarizer(llm, question)
doc_summarizer = MultiStepDocumentSummarizer(llm, question)
path = str(TEST_DIR / "resources/data/pdfs/Ray.pdf")
context = sycamore.init(exec_mode=EXEC_RAY)
result = (
Expand All @@ -138,7 +102,7 @@ def test_summarize_map_reduce(self, llm):
docset = (
context.read.binary(path, binary_format="pdf").partition(partitioner=UnstructuredPdfPartitioner()).explode()
)
final_summary = summarize_data(llm, question, result_description="Ray paper", result_data=[docset])

final_summary = summarize_map_reduce(llm, question, "summary", [docset])
print(final_summary)
assert final_summary
Loading
Loading