     RelevanceEvaluator,
     SimilarityEvaluator,
     GroundednessEvaluator,
-    # QAEvaluator,
+    QAEvaluator,
     ContentSafetyEvaluator,
     GroundednessProEvaluator,
     ProtectedMaterialEvaluator,
@@ -88,7 +88,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
             "fluency": FluencyEvaluator(model_config),
             "relevance": RelevanceEvaluator(model_config),
             "similarity": SimilarityEvaluator(model_config),
-            # "qa": QAEvaluator(model_config),
+            "qa": QAEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
             "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
@@ -105,7 +105,7 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]
 
-        assert len(row_result_df.keys()) == 48  # 63 with qa
+        assert len(row_result_df.keys()) == 63
         assert len(row_result_df["inputs.query"]) == 3
         assert len(row_result_df["inputs.context"]) == 3
         assert len(row_result_df["inputs.response"]) == 3
@@ -154,23 +154,23 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert len(row_result_df["outputs.content_safety.violence"]) == 3
         assert len(row_result_df["outputs.content_safety.violence_score"]) == 3
         assert len(row_result_df["outputs.content_safety.violence_reason"]) == 3
-        # assert len(row_result_df["outputs.qa.f1_score"]) == 3
-        # assert len(row_result_df["outputs.qa.groundedness"]) == 3
-        # assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3
-        # assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3
-        # assert len(row_result_df["outputs.qa.coherence"]) == 3
-        # assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3
-        # assert len(row_result_df["outputs.qa.coherence_reason"]) == 3
-        # assert len(row_result_df["outputs.qa.fluency"]) == 3
-        # assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3
-        # assert len(row_result_df["outputs.qa.fluency_reason"]) == 3
-        # assert len(row_result_df["outputs.qa.relevance"]) == 3
-        # assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3
-        # assert len(row_result_df["outputs.qa.relevance_reason"]) == 3
-        # assert len(row_result_df["outputs.qa.similarity"]) == 3
-        # assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3
+        assert len(row_result_df["outputs.qa.f1_score"]) == 3
+        assert len(row_result_df["outputs.qa.groundedness"]) == 3
+        assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3
+        assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3
+        assert len(row_result_df["outputs.qa.coherence"]) == 3
+        assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3
+        assert len(row_result_df["outputs.qa.coherence_reason"]) == 3
+        assert len(row_result_df["outputs.qa.fluency"]) == 3
+        assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3
+        assert len(row_result_df["outputs.qa.fluency_reason"]) == 3
+        assert len(row_result_df["outputs.qa.relevance"]) == 3
+        assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3
+        assert len(row_result_df["outputs.qa.relevance_reason"]) == 3
+        assert len(row_result_df["outputs.qa.similarity"]) == 3
+        assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3
 
-        assert len(metrics.keys()) == 28  # 39 with qa
+        assert len(metrics.keys()) == 39
         assert metrics["f1_score.f1_score"] >= 0
         assert metrics["gleu.gleu_score"] >= 0
         assert metrics["bleu.bleu_score"] >= 0
@@ -199,17 +199,17 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope
         assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
-        # assert metrics["qa.f1_score"] >= 0
-        # assert metrics["qa.groundedness"] >= 0
-        # assert metrics["qa.gpt_groundedness"] >= 0
-        # assert metrics["qa.coherence"] >= 0
-        # assert metrics["qa.gpt_coherence"] >= 0
-        # assert metrics["qa.fluency"] >= 0
-        # assert metrics["qa.gpt_fluency"] >= 0
-        # assert metrics["qa.relevance"] >= 0
-        # assert metrics["qa.gpt_relevance"] >= 0
-        # assert metrics["qa.similarity"] >= 0
-        # assert metrics["qa.gpt_similarity"] >= 0
+        assert metrics["qa.f1_score"] >= 0
+        assert metrics["qa.groundedness"] >= 0
+        assert metrics["qa.gpt_groundedness"] >= 0
+        assert metrics["qa.coherence"] >= 0
+        assert metrics["qa.gpt_coherence"] >= 0
+        assert metrics["qa.fluency"] >= 0
+        assert metrics["qa.gpt_fluency"] >= 0
+        assert metrics["qa.relevance"] >= 0
+        assert metrics["qa.gpt_relevance"] >= 0
+        assert metrics["qa.similarity"] >= 0
+        assert metrics["qa.gpt_similarity"] >= 0
 
     def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
         evaluators = {
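
For context, a minimal standalone sketch of the re-enabled QAEvaluator, assuming the azure-ai-evaluation package and placeholder Azure OpenAI settings (not taken from this diff). The composite evaluator fans out to groundedness, relevance, coherence, fluency, similarity, and F1, which is why uncommenting it adds 15 outputs.qa.* row columns (48 to 63) and 11 qa.* aggregate metrics (28 to 39) in the assertions above.

# Minimal sketch (not part of this diff): call QAEvaluator directly on one
# query/response pair. Endpoint, key, and deployment values are placeholders.
from azure.ai.evaluation import QAEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "api_key": "<your-api-key>",                                   # placeholder
    "azure_deployment": "<your-deployment>",                       # placeholder
}

qa_eval = QAEvaluator(model_config)

# Returns a flat dict whose keys correspond to the outputs.qa.* columns asserted
# in the test, e.g. f1_score, groundedness, coherence, fluency, relevance,
# similarity plus the gpt_*/*_reason variants.
result = qa_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France is a country in Europe; its capital city is Paris.",
    ground_truth="Paris is the capital of France.",
)
print(result["f1_score"], result["groundedness"], result["relevance"])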