Add max score testing framework with Bootstrap and Simple testers #1914

Draft: wants to merge 11 commits into base: meta-knn-few-shot

5 changes: 4 additions & 1 deletion dspy/evaluate/auto_evaluation.py
@@ -45,7 +45,10 @@ def __init__(self, threshold=0.66, decompositional=False):
         self.module = dspy.ChainOfThought(SemanticRecallPrecision)

     def forward(self, example, pred, trace=None):
-        scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
+        ground_truth = example.response if hasattr(example, 'response') else getattr(example, 'answer', None)
+        system_response = pred.response if hasattr(pred, 'response') else getattr(pred, 'answer', None)
+
+        scores = self.module(question=example.question, ground_truth=ground_truth, system_response=system_response)
         score = f1_score(scores.precision, scores.recall)

         return score if trace is None else score >= self.threshold
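This hunk makes SemanticF1 tolerant of datasets whose gold field is named answer instead of response. A minimal usage sketch, not part of the diff (assumes an LM is already configured via dspy.settings; the example values are hypothetical):

import dspy
from dspy.evaluate import SemanticF1

metric = SemanticF1(threshold=0.66)

# This pair exposes `answer` rather than `response`; before this change the
# attribute lookups would have failed, now they fall back via getattr.
example = dspy.Example(question="Who wrote Hamlet?", answer="William Shakespeare")
pred = dspy.Prediction(answer="Hamlet was written by William Shakespeare.")

score = metric(example, pred)               # raw F1 in [0, 1] when trace is None
passed = metric(example, pred, trace=True)  # thresholded boolean otherwise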
35 changes: 35 additions & 0 deletions dspy/evaluate/metrics.py
@@ -1,6 +1,8 @@
# TODO: This should move internally. Same for passage_match. dspy.metrics.answer_exact_match, dspy.metrics.answer_passage_match

import dsp
from dsp import normalize_text
from dspy.evaluate import SemanticF1


def answer_exact_match(example, pred, trace=None, frac=1.0):
@@ -20,3 +22,36 @@ def answer_passage_match(example, pred, trace=None):
        return dsp.passage_match(pred.context, [example.answer])
    else:  # type(example.answer) is list
        return dsp.passage_match(pred.context, example.answer)

def answer_similar_match(example, pred, trace=None):
    assert isinstance(example.answer, (str, list))

    def is_substring(text1, text2):
        # Normalize both texts using the existing normalize_text function
        text1 = normalize_text(text1)
        text2 = normalize_text(text2)
        return text1 in text2 or text2 in text1

    pred_answer = pred.answer
    if isinstance(example.answer, str):
        return is_substring(pred_answer, example.answer)
    else:  # example.answer is a list
        return any(is_substring(pred_answer, ans) for ans in example.answer)
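For illustration, a quick sketch of the bidirectional substring semantics (hypothetical values; assumes dsp.normalize_text applies the usual SQuAD-style lowercasing and punctuation stripping):

import dspy

example = dspy.Example(answer=["Paris", "the city of Paris"])
pred = dspy.Prediction(answer="Paris, France")

# After normalization, "paris" is contained in "paris france", so any()
# over the gold list succeeds.
answer_similar_match(example, pred)  # -> True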

def answer_exact_match_and_semantic(example, pred, trace=None, frac=1.0, threshold=0.95):
    """
    Combines the exact match and semantic F1 checks.
    Returns True if either the exact match succeeds or the semantic F1 score meets the threshold.
    """
    # Check the cheap exact match first
    exact_match = answer_exact_match(example, pred, trace=trace, frac=frac)

    if exact_match:
        return True

    # On a miss, fall back to semantic similarity. Passing trace=True makes
    # SemanticF1 return the thresholded boolean (score >= threshold) rather
    # than the raw F1 score.
    semantic_f1 = SemanticF1(threshold=threshold)
    semantic_score = semantic_f1(example, pred, trace=True)

    return semantic_score
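A hedged usage sketch of the combined metric (hypothetical values; an LM must be configured, since a miss on the exact match triggers an LM call through SemanticF1):

import dspy

example = dspy.Example(question="What is the capital of France?", answer="Paris")
pred = dspy.Prediction(answer="The capital of France is Paris.")

# The cheap exact-match check runs first; on a miss the call falls through
# to SemanticF1 and returns the thresholded boolean.
answer_exact_match_and_semantic(example, pred, threshold=0.95)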

54 changes: 54 additions & 0 deletions testing/max_score_example.py
@@ -0,0 +1,54 @@
import os
from dotenv import load_dotenv
import dspy
from max_score_tester import BootstrapMaxScoreTester
from optimizer_tester import OptimizerTester
from phoenix.otel import register
from openinference.instrumentation.dspy import DSPyInstrumentor
from openinference.instrumentation.litellm import LiteLLMInstrumentor

# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
api_key_openai = os.getenv('OPENAI_API_KEY')

if not api_key_openai:
raise ValueError("OPENAI_API_KEY not found in environment variables")

# Setup Phoenix tracing
register(
    endpoint="https://app.phoenix.arize.com/v1/traces",
)

# Initialize OpenTelemetry instrumentation
DSPyInstrumentor().instrument(skip_dep_check=True)
LiteLLMInstrumentor().instrument(skip_dep_check=True)

lm = dspy.LM(model="openai/gpt-4o-mini", api_key=api_key_openai)
embedder = dspy.Embedder(
    model="openai/text-embedding-3-small",
    api_key=api_key_openai
)
dspy.settings.configure(lm=lm, embedder=embedder)

# Initialize the tester
tester = BootstrapMaxScoreTester(
    n_programs=2,
    max_labeled_demos=1,
    early_stopping_threshold=0.95,
    num_threads=32,
    dataset_name="hover_retrieve_discrete"
)

# Load dataset
optimizer_tester = OptimizerTester(task_model=lm, prompt_model=lm)
task = optimizer_tester.load_dataset("hover_retrieve_discrete")

# Run the test
results = tester.test_dataset(task)

print(f"Maximum Train Score: {results['train_results']['solved_items']:.3f}")
print(f"Maximum Dev Score: {results['dev_results']['solved_items']:.3f}")
print(f"Maximum Test Score: {results['test_results']['solved_items']:.3f}")
