Add max score testing framework with Bootstrap and Simple testers #1914

Draft: wants to merge 11 commits into base: meta-knn-few-shot

5 changes: 4 additions & 1 deletion dspy/evaluate/auto_evaluation.py
@@ -45,7 +45,10 @@ def __init__(self, threshold=0.66, decompositional=False):
         self.module = dspy.ChainOfThought(SemanticRecallPrecision)

     def forward(self, example, pred, trace=None):
-        scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
+        ground_truth = example.response if hasattr(example, 'response') else getattr(example, 'answer', None)
+        system_response = pred.response if hasattr(pred, 'response') else getattr(pred, 'answer', None)
+
+        scores = self.module(question=example.question, ground_truth=ground_truth, system_response=system_response)
         score = f1_score(scores.precision, scores.recall)

         return score if trace is None else score >= self.threshold
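This hunk makes SemanticF1 tolerant of datasets whose gold field is named answer instead of response. A minimal usage sketch, not part of the diff (assumes an LM is already configured via dspy.settings; the example values are hypothetical):

import dspy
from dspy.evaluate import SemanticF1

metric = SemanticF1(threshold=0.66)

# This pair exposes `answer` rather than `response`; before this change the
# attribute lookups would have failed, now they fall back via getattr.
example = dspy.Example(question="Who wrote Hamlet?", answer="William Shakespeare")
pred = dspy.Prediction(answer="Hamlet was written by William Shakespeare.")

score = metric(example, pred)               # raw F1 in [0, 1] when trace is None
passed = metric(example, pred, trace=True)  # thresholded boolean otherwise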
35 changes: 35 additions & 0 deletions dspy/evaluate/metrics.py
@@ -1,6 +1,8 @@
# TODO: This should move internally. Same for passage_match. dspy.metrics.answer_exact_match, dspy.metrics.answer_passage_match

import dsp
from dsp import normalize_text
from dspy.evaluate import SemanticF1


def answer_exact_match(example, pred, trace=None, frac=1.0):
@@ -20,3 +22,36 @@ def answer_passage_match(example, pred, trace=None):
        return dsp.passage_match(pred.context, [example.answer])
    else:  # type(example.answer) is list
        return dsp.passage_match(pred.context, example.answer)

def answer_similar_match(example, pred, trace=None):
    assert isinstance(example.answer, (str, list))

    def is_substring(text1, text2):
        # Normalize both texts using the existing normalize_text function
        text1 = normalize_text(text1)
        text2 = normalize_text(text2)
        return text1 in text2 or text2 in text1

    pred_answer = pred.answer
    if isinstance(example.answer, str):
        return is_substring(pred_answer, example.answer)
    else:  # example.answer is a list
        return any(is_substring(pred_answer, ans) for ans in example.answer)
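For illustration, a quick sketch of the bidirectional substring semantics (hypothetical values; assumes dsp.normalize_text applies the usual SQuAD-style lowercasing and punctuation stripping):

import dspy

example = dspy.Example(answer=["Paris", "the city of Paris"])
pred = dspy.Prediction(answer="Paris, France")

# After normalization, "paris" is contained in "paris france", so any()
# over the gold list succeeds.
answer_similar_match(example, pred)  # -> True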

def answer_exact_match_and_semantic(example, pred, trace=None, frac=1.0, threshold=0.95):
    """
    Combines the exact match and semantic F1 checks.
    Returns True if either the exact match succeeds or the semantic F1 score meets the threshold.
    """
    # Check the cheap exact match first
    exact_match = answer_exact_match(example, pred, trace=trace, frac=frac)

    if exact_match:
        return True

    # On a miss, fall back to semantic similarity. Passing trace=True makes
    # SemanticF1 return the thresholded boolean (score >= threshold) rather
    # than the raw F1 score.
    semantic_f1 = SemanticF1(threshold=threshold)
    semantic_score = semantic_f1(example, pred, trace=True)

    return semantic_score
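A hedged usage sketch of the combined metric (hypothetical values; an LM must be configured, since a miss on the exact match triggers an LM call through SemanticF1):

import dspy

example = dspy.Example(question="What is the capital of France?", answer="Paris")
pred = dspy.Prediction(answer="The capital of France is Paris.")

# The cheap exact-match check runs first; on a miss the call falls through
# to SemanticF1 and returns the thresholded boolean.
answer_exact_match_and_semantic(example, pred, threshold=0.95)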

54 changes: 54 additions & 0 deletions testing/max_score_example.py
@@ -0,0 +1,54 @@
import os
from dotenv import load_dotenv
import dspy
from max_score_tester import BootstrapMaxScoreTester
from optimizer_tester import OptimizerTester
from phoenix.otel import register
from openinference.instrumentation.dspy import DSPyInstrumentor
from openinference.instrumentation.litellm import LiteLLMInstrumentor

# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
api_key_openai = os.getenv('OPENAI_API_KEY')

if not api_key_openai:
raise ValueError("OPENAI_API_KEY not found in environment variables")

# Setup Phoenix tracing
register(
    endpoint="https://app.phoenix.arize.com/v1/traces",
)

# Initialize OpenTelemetry instrumentation
DSPyInstrumentor().instrument(skip_dep_check=True)
LiteLLMInstrumentor().instrument(skip_dep_check=True)

lm = dspy.LM(model="openai/gpt-4o-mini", api_key=api_key_openai)
embedder = dspy.Embedder(
    model="openai/text-embedding-3-small",
    api_key=api_key_openai
)
dspy.settings.configure(lm=lm, embedder=embedder)

# Initialize the tester
tester = BootstrapMaxScoreTester(
    n_programs=2,
    max_labeled_demos=1,
    early_stopping_threshold=0.95,
    num_threads=32,
    dataset_name="hover_retrieve_discrete"
)

# Load dataset
optimizer_tester = OptimizerTester(task_model=lm, prompt_model=lm)
task = optimizer_tester.load_dataset("hover_retrieve_discrete")

# Run the test
results = tester.test_dataset(task)

print(f"Maximum Train Score: {results['train_results']['solved_items']:.3f}")
print(f"Maximum Dev Score: {results['dev_results']['solved_items']:.3f}")
print(f"Maximum Test Score: {results['test_results']['solved_items']:.3f}")
