@@ -142,11 +142,36 @@ def collapse(text: str, tokens_per_chunk: int, tokenizer: Tokenizer, summarizer_
 
 
 class HeirarchicalDocumentSummarizer(Summarizer):
+    """
+    Summarizes a document by constructing a heirarchical tree of batches of elements,
+    summarizing each one, and then repeating the process on the remaining summaries. For
+    example, with element_batch_size=3:
+        Elements - e0 - e1 - e2 - e3 - e4 - e5 - e6 - e7 - e8 - e9 - e10
+                    |    |    |    |    |    |    |    |    |    |    |
+                   summary 0-2 - summary 3-5 - summary 6-8 - summary 9-10
+                        |            |             |             |
+                        summary 0-8                summary 9-10
+                             |                          |
+                             summary 0-10
+
+    Args:
+        llm: The llm to use to summarize
+        question: Optional question to use as context for summarization. If set, the llm
+            will attempt to use the data it's summarizing to answer the question
+        prompt: Prompt to use for each summarization. Caution: The default (SummarizeBranchingFactorJinjaPrompt)
+            has some fairly complicated logic encoded in it to make the tree construction work
+            correctly.
+        fields: List of fields to include in each element's representation in the prompt. Specify
+            with dotted notation (e.g. properties.title), or use "*" to capture everything. If None,
+            will include no fields.
+        element_batch_size: Branching factor of the constructed tree. Default is 10.
+    """
+
     def __init__(
         self,
         llm: LLM,
         question: Optional[str] = None,
-        prompt: Optional[JinjaElementPrompt] = None,
+        prompt: Optional[SycamorePrompt] = None,
         fields: Union[None, Literal["*"], list[str]] = None,
         element_batch_size: Optional[int] = None,
     ):
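
For orientation, a minimal construction sketch based only on the arguments documented in the new docstring above; the llm instance, the question string, and the field names are placeholder assumptions, not taken from this diff:

    # Hypothetical usage; `llm` is any sycamore LLM instance built elsewhere.
    summarizer = HeirarchicalDocumentSummarizer(
        llm,
        question="What failures does this report describe?",  # optional: steer each summary toward the question
        fields=["properties.title"],                           # dotted field names included per element
        element_batch_size=3,                                  # branching factor, as in the tree diagram above
    )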
@@ -225,6 +250,27 @@ def cleanup(doc: Document) -> Document:
 
 
 class MaxTokensHeirarchicalDocumentSummarizer(Summarizer):
+    """
+    Summarizes a document by constructing a tree, similarly to HeirarchicalDocumentSummarizer.
+    Each batch of elements is determined by the number of tokens - each sub-summarization takes
+    as many elements as possible within the token limit.
+
+    Args:
+        llm: LLM to use for summarization
+        question: Optional question to use as context for the summarization. If set, the llm will
+            attempt to answer the question with the data provided
+        prompt: Prompt to use for each summarization. Caution: The default (MaxTokensHeirarchicalSummarizerPrompt)
+            has some fairly complicated logic encoded in it to make the tree construction work correctly.
+        fields: List of fields to include in each element's representation in the prompt. Specify
+            with dotted notation (e.g. properties.title), or use "*" to capture everything. If None,
+            will include no fields.
+        max_tokens: token limit for each summarization. Default is 10k (default tokenizer is by character).
+        tokenizer: tokenizer to use when computing how many tokens a prompt will take. Default is
+            CharacterTokenizer
+        rounds: number of rounds of heirarchical summarization to perform. The number of elements that can be
+            included in the summary is O(e^rounds), so rounds can be small. Default is 4.
+    """
+
     def __init__(
         self,
         llm: LLM,
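
A similar hedged sketch for the token-bounded variant; the numeric values and question are illustrative assumptions, and the tokenizer is left at its documented default:

    # Hypothetical usage; batches are sized by token count rather than a fixed element count.
    summarizer = MaxTokensHeirarchicalDocumentSummarizer(
        llm,
        question="Summarize the key findings",  # optional question context
        max_tokens=5_000,                       # per-summarization budget (default 10k, counted by character)
        rounds=3,                               # capacity grows roughly exponentially with the number of rounds
    )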
@@ -311,6 +357,22 @@ def as_llm_map(self, child: Optional[Node], **kwargs) -> Node:
 
 
 class CollapseDocumentSummarizer(Summarizer):
+    """
+    Summarizes a document by converting it all to text, then iteratively summarizing chunks
+    of the text + the existing summary to build up a full summary.
+
+    Args:
+        llm: LLM to use for summarization
+        question: Question to use as context for the summarization. The llm will attempt to
+            answer the question using the data in the document.
+        chunk_size: Size of the chunks to add in each round of summarization
+        tokenizer: Tokenizer to use to compute chunk sizes
+        use_elements: If True, will include data from the elements of the document as well
+            as the document itself. Default is False
+        num_elements: Limit on the number of elements to include if use_elements is true (take
+            the first num_elements elements). Default is 5
+    """
+
     def __init__(
         self,
         llm: LLM,
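
A hedged sketch for the collapse-style summarizer; the question and chunk size are placeholder values, and whether chunk_size and tokenizer have defaults is not visible in this hunk, so both are passed explicitly by keyword:

    # Hypothetical usage; the document is flattened to text and summarized chunk by chunk.
    summarizer = CollapseDocumentSummarizer(
        llm,
        question="What obligations does the contract impose?",  # answered from the document text
        chunk_size=2_000,     # tokens of new text folded in per summarization round (assumed value)
        use_elements=True,    # also pull text from the document's elements
        num_elements=5,       # only the first 5 elements are included
    )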
@@ -366,6 +428,26 @@ class EtCetera:
 
 
 class RoundRobinOneshotDocumentSummarizer(Summarizer):
+    """
+    Summarizes a document in a single LLM call by taking as much data as possible
+    from every element, spread across them evenly. Intended for use with summarize_data,
+    where a summarizer is used to summarize an entire docset.
+
+    Args:
+        llm: LLM to use for summarization
+        question: Question to use as context for the summary. The llm will attempt to
+            use the data provided to answer the question.
+        token_limit: Token limit for the prompt. Default is 10k (default tokenizer is
+            by character)
+        tokenizer: Tokenizer to use to count tokens (to not exceed the token limit).
+            Default is CharacterTokenizer
+        fields: List of fields to include from every element. To include any additional
+            fields (after the ones specified), end the list with `EtCetera`. Default is
+            empty list, which stands for 'as many fields as fit within the token limit'
+            and is equivalent to `[EtCetera]`
+
+    """
+
     def __init__(
         self,
        llm: LLM,
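
Finally, a hedged sketch for the one-shot round-robin summarizer; the question, limit, and field names are illustrative assumptions, while EtCetera is the sentinel class shown in this hunk's context:

    # Hypothetical usage; one LLM call, fields taken round-robin from every element until the limit is hit.
    summarizer = RoundRobinOneshotDocumentSummarizer(
        llm,
        question="Which vendors are mentioned and why?",
        token_limit=8_000,                      # prompt budget (default 10k, counted by character)
        fields=["properties.title", EtCetera],  # named fields first, then whatever else fits
    )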