Skip to content

Commit 7fce340

Browse files
committed
a whole bunch of docstrings
Signed-off-by: Henry Lindeman <[email protected]>
1 parent eb18e7e commit 7fce340

File tree

2 files changed

+89
-5
lines changed

2 files changed

+89
-5
lines changed

lib/sycamore/sycamore/query/execution/operations.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,13 @@ def summarize_data(
7676
question: Question to answer.
7777
result_description: Description of each of the inputs in result_data.
7878
result_data: List of inputs.
79-
use_elements: Use text contents from document.elements instead of document.text_representation.
80-
num_elements: Number of elements whose text to use from each document.
81-
max_tokens: Maximum number of tokens allowed in the summary to send to the LLM.
82-
tokenizer: Tokenizer to use for counting against max_tokens.
79+
summaries_as_text: If true, summarize all documents in the result_data docsets and treat
80+
those summaries as the text representation for the final summarize step.
8381
context: Optional Context object to get default parameters from.
82+
docset_summarizer: Summarizer class to use to summarize the docset.
83+
Default is `DEFAULT_DOCSET_SUMMARIZER`
84+
summarizer_kwargs: keyword arguments to pass to the docset summarizer constructor. e.g.
85+
`tokenizer`, `token_limit`, and `element_batch_size`
8486
**kwargs: Additional keyword arguments.
8587
8688
Returns:

lib/sycamore/sycamore/transforms/summarize.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,36 @@ def collapse(text: str, tokens_per_chunk: int, tokenizer: Tokenizer, summarizer_
142142

143143

144144
class HeirarchicalDocumentSummarizer(Summarizer):
145+
"""
146+
Summarizes a document by constructing a hierarchical tree of batches of elements,
147+
summarizing each one, and then repeating the process on the remaining summaries. For
148+
example, with element_batch_size=3:
149+
Elements - e0 - e1 - e2 - e3 - e4 - e5 - e6 - e7 - e8 - e9 - e10
150+
| | | | | | | | | | |
151+
summary 0-2 - summary 3-5 - summary 6-8 - summary 9-10
152+
| | | |
153+
summary 0-8 summary 9-10
154+
| |
155+
summary 0-10
156+
157+
Args:
158+
llm: The llm to use to summarize
159+
question: Optional question to use as context for summarization. If set, the llm
160+
will attempt to use the data it's summarizing to answer the question
161+
prompt: Prompt to use for each summarization. Caution: The default (SummarizeBranchingFactorJinjaPrompt)
162+
has some fairly complicated logic encoded in it to make the tree construction work
163+
correctly.
164+
fields: List of fields to include in each element's representation in the prompt. Specify
165+
with dotted notation (e.g. properties.title), or use "*" to capture everything. If None,
166+
will include no fields.
167+
element_batch_size: Branching factor of the constructed tree. Default is 10.
168+
"""
169+
145170
def __init__(
146171
self,
147172
llm: LLM,
148173
question: Optional[str] = None,
149-
prompt: Optional[JinjaElementPrompt] = None,
174+
prompt: Optional[SycamorePrompt] = None,
150175
fields: Union[None, Literal["*"], list[str]] = None,
151176
element_batch_size: Optional[int] = None,
152177
):
@@ -225,6 +250,27 @@ def cleanup(doc: Document) -> Document:
225250

226251

227252
class MaxTokensHeirarchicalDocumentSummarizer(Summarizer):
253+
"""
254+
Summarizes a document by constructing a tree, similarly to HeirarchicalDocumentSummarizer.
255+
Each batch of elements is determined by the number of tokens - each sub-summarization takes
256+
as many elements as possible within the token limit.
257+
258+
Args:
259+
llm: LLM to use for summarization
260+
question: Optional question to use as context for the summarization. If set, the llm will
261+
attempt to answer the question with the data provided
262+
prompt: Prompt to use for each summarization. Caution: The default (MaxTokensHeirarchicalSummarizerPrompt)
263+
has some fairly complicated logic encoded in it to make the tree construction work correctly.
264+
fields: List of fields to include in each element's representation in the prompt. Specify
265+
with dotted notation (e.g. properties.title), or use "*" to capture everything. If None,
266+
will include no fields.
267+
max_tokens: token limit for each summarization. Default is 10k (default tokenizer is by character).
268+
tokenizer: tokenizer to use when computing how many tokens a prompt will take. Default is
269+
CharacterTokenizer
270+
rounds: number of rounds of hierarchical summarization to perform. The number of elements that can be
271+
included in the summary is O(e^rounds), so rounds can be small. Default is 4.
272+
"""
273+
228274
def __init__(
229275
self,
230276
llm: LLM,
@@ -311,6 +357,22 @@ def as_llm_map(self, child: Optional[Node], **kwargs) -> Node:
311357

312358

313359
class CollapseDocumentSummarizer(Summarizer):
360+
"""
361+
Summarizes a document by converting it all to text, then iteratively summarizing chunks
362+
of the text + the existing summary to build up a full summary.
363+
364+
Args:
365+
llm: LLM to use for summarization
366+
question: Question to use as context for the summarization. The llm will attempt to
367+
answer the question using the data in the document.
368+
chunk_size: Size of the chunks to add in each round of summarization
369+
tokenizer: Tokenizer to use to compute chunk sizes
370+
use_elements: If True, will include data from the elements of the document as well
371+
as the document itself. Default is False
372+
num_elements: Limit on the number of elements to include if use_elements is true (take
373+
the first num_elements elements). Default is 5
374+
"""
375+
314376
def __init__(
315377
self,
316378
llm: LLM,
@@ -366,6 +428,26 @@ class EtCetera:
366428

367429

368430
class RoundRobinOneshotDocumentSummarizer(Summarizer):
431+
"""
432+
Summarizes a document in a single LLM call by taking as much data as possible
433+
from every element, spread across them evenly. Intended for use with summarize_data,
434+
where a summarizer is used to summarize an entire docset.
435+
436+
Args:
437+
llm: LLM to use for summarization
438+
question: Question to use as context for the summary. The llm will attempt to
439+
use the data provided to answer the question.
440+
token_limit: Token limit for the prompt. Default is 10k (default tokenizer is
441+
by character)
442+
tokenizer: Tokenizer to use to count tokens (to not exceed the token limit).
443+
Default is CharacterTokenizer
444+
fields: List of fields to include from every element. To include any additional
445+
fields (after the ones specified), end the list with `EtCetera`. Default is
446+
empty list, which stands for 'as many fields as fit within the token limit'
447+
and is equivalent to `[EtCetera]`
448+
449+
"""
450+
369451
def __init__(
370452
self,
371453
llm: LLM,

0 commit comments

Comments
 (0)