llm_as_a_judge.py
from __future__ import annotations

import asyncio
from dataclasses import dataclass
from typing import Literal

from agents import Agent, ItemHelpers, Runner, TResponseInputItem, trace

"""
This example shows the LLM as a judge pattern. The first agent generates an outline for a story.
The second agent judges the outline and provides feedback. We loop until the judge is satisfied
with the outline.
"""
story_outline_generator = Agent(
    name="story_outline_generator",
    instructions=(
        "You generate a very short story outline based on the user's input. "
        "If there is any feedback provided, use it to improve the outline."
    ),
)

@dataclass
class EvaluationFeedback:
    feedback: str
    score: Literal["pass", "needs_improvement", "fail"]
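
# Note: passing `output_type=EvaluationFeedback` below makes the evaluator return structured
# output, so `evaluator_result.final_output` in main() is an EvaluationFeedback instance.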
evaluator = Agent[None](
    name="evaluator",
    instructions=(
        "You evaluate a story outline and decide if it's good enough. "
        "If it's not good enough, you provide feedback on what needs to be improved. "
        "Never give it a pass on the first try."
    ),
    output_type=EvaluationFeedback,
)

async def main() -> None:
    msg = input("What kind of story would you like to hear? ")
    input_items: list[TResponseInputItem] = [{"content": msg, "role": "user"}]

    latest_outline: str | None = None

    # We'll run the entire workflow in a single trace
    with trace("LLM as a judge"):
        while True:
            story_outline_result = await Runner.run(
                story_outline_generator,
                input_items,
            )

            # Carry the full conversation (original request plus the new outline) into the next run
            input_items = story_outline_result.to_input_list()
            latest_outline = ItemHelpers.text_message_outputs(story_outline_result.new_items)
            print("Story outline generated")

            evaluator_result = await Runner.run(evaluator, input_items)
            result: EvaluationFeedback = evaluator_result.final_output

            print(f"Evaluator score: {result.score}")

            if result.score == "pass":
                print("Story outline is good enough, exiting.")
                break

            # The judge was not satisfied: append its feedback as a user message and loop again
            print("Re-running with feedback")
            input_items.append({"content": f"Feedback: {result.feedback}", "role": "user"})

    print(f"Final story outline: {latest_outline}")

if __name__ == "__main__":
    asyncio.run(main())
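
# To try this pattern locally, you'd run the script directly. The assumptions here (not shown in
# this file) are that the `openai-agents` package is installed and OPENAI_API_KEY is set, e.g.:
#
#   pip install openai-agents
#   export OPENAI_API_KEY=sk-...
#   python llm_as_a_judge.py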