-
Notifications
You must be signed in to change notification settings - Fork 336
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add evaluation core functions (#515)
## Motivation and Context (Why the change? What's the scenario?)

First code commit for evaluating KM memory using the RAGAS methodology.

## High level description (Approach, Design)

- Add test set generation with SK, using the KM index over existing data.
- Add test set evaluation of KM using SK.
- Loading branch information
1 parent
37b001d
commit dd9915d
Showing
38 changed files
with
1,968 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Reflection; | ||
using System.Security.Cryptography; | ||
using System.Threading.Tasks; | ||
using Microsoft.KernelMemory.MemoryStorage; | ||
|
||
namespace Microsoft.KernelMemory.Evaluation; | ||
|
||
/// <summary>
/// Base class for evaluators and generators. Provides helpers to load embedded
/// prompt templates, retry a failing asynchronous operation, partition records
/// into groups and shuffle a sequence.
/// </summary>
public abstract class EvaluationEngine
{
    /// <summary>
    /// Reads a prompt template embedded as a manifest resource in the assembly that
    /// contains this class.
    /// </summary>
    /// <param name="pluginName">Middle segment of the resource name (<c>Prompts/{pluginName}/...</c>).</param>
    /// <param name="functionName">Resource file name without the <c>.txt</c> extension.</param>
    /// <returns>The full text of the embedded resource.</returns>
    /// <exception cref="InvalidOperationException">No resource with the computed name exists.</exception>
    protected string GetSKPrompt(string pluginName, string functionName)
    {
        var resourceName = $"Prompts/{pluginName}/{functionName}.txt";

        // GetManifestResourceStream returns null for an unknown name; fail with a message
        // that names the resource instead of letting StreamReader reject a null stream.
        var resourceStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)
            ?? throw new InvalidOperationException($"Embedded prompt resource '{resourceName}' was not found.");

        // Disposing the reader also disposes the underlying stream.
        using var reader = new StreamReader(resourceStream);
        return reader.ReadToEnd();
    }

    /// <summary>
    /// Runs <paramref name="action"/> and, when it throws, runs it again until the retry
    /// budget is exhausted. The action is invoked at most <c>maxCount + 1</c> times.
    /// </summary>
    /// <typeparam name="T">Type of the value produced by the action.</typeparam>
    /// <param name="maxCount">Number of retries allowed after the first attempt.</param>
    /// <param name="action">
    /// Operation to run. It receives the number of retries still available
    /// (<paramref name="maxCount"/> on the first attempt, 0 on the last one).
    /// </param>
    /// <returns>The result of the first attempt that completes without throwing.</returns>
    /// <remarks>
    /// Once no retries remain (including when <paramref name="maxCount"/> is negative)
    /// the exception thrown by the action propagates to the caller unchanged.
    /// </remarks>
    protected async Task<T> Try<T>(int maxCount, Func<int, Task<T>> action)
    {
        for (var remaining = maxCount; ; remaining--)
        {
            try
            {
                return await action(remaining).ConfigureAwait(false);
            }
            catch (Exception) when (remaining > 0)
            {
                // Retries are still available: ignore this failure and try again.
            }
        }
    }

    /// <summary>
    /// Splits records into <paramref name="count"/> contiguous groups that together
    /// contain every record exactly once, preserving the original order.
    /// </summary>
    /// <param name="records">The records to distribute.</param>
    /// <param name="count">The number of groups to create.</param>
    /// <returns>
    /// <paramref name="count"/> groups whose sizes differ by at most one (the leading groups
    /// receive the extra records). Groups are empty when there are fewer records than groups.
    /// An empty sequence is returned when <paramref name="count"/> is zero or negative.
    /// </returns>
    protected IEnumerable<MemoryRecord[]> SplitRecordsIntoNodes(MemoryRecord[] records, int count)
    {
        var groups = new List<MemoryRecord[]>();

        if (count <= 0)
        {
            return groups;
        }

        // Integer division plus remainder guarantees full coverage; rounding the average
        // group size would drop trailing records or leave trailing groups empty.
        var baseSize = records.Length / count;
        var remainder = records.Length % count;
        var start = 0;

        for (var i = 0; i < count; i++)
        {
            var size = i < remainder ? baseSize + 1 : baseSize;

            groups.Add(records[start..(start + size)]);
            start += size;
        }

        return groups;
    }

    /// <summary>
    /// Returns the elements of <paramref name="source"/> in random order.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="source">Sequence to shuffle; it is copied, not modified.</param>
    /// <returns>A new array holding the same elements, shuffled with <see cref="RandomNumberGenerator"/>.</returns>
    protected IEnumerable<T> Shuffle<T>(IEnumerable<T> source)
    {
        // ToArray already produces a private copy, so it can be shuffled in place and returned.
        var items = source.ToArray();

        RandomNumberGenerator.Shuffle(items.AsSpan());

        return items;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
namespace Microsoft.KernelMemory.Evaluation; | ||
|
||
/// <summary>
/// Container for the individual scores produced when evaluating a generated answer.
/// </summary>
public sealed class EvaluationMetrics
{
    /// <summary>
    /// How relevant the answer is to the question that was asked.
    /// </summary>
    public float AnswerRelevancy { get; set; }

    /// <summary>
    /// Semantic similarity between the ground truth and the generated answer.
    /// </summary>
    public float AnswerSemanticSimilarity { get; set; }

    /// <summary>
    /// Correctness of the answer relative to the ground truth, combining factuality
    /// with semantic similarity.
    /// </summary>
    public float AnswerCorrectness { get; set; }

    /// <summary>
    /// Factual consistency of the generated answer with respect to the given context.
    /// </summary>
    public float Faithfulness { get; set; }

    /// <summary>
    /// Average-precision style score: evaluates whether the relevant items selected
    /// by the model are ranked higher than the others.
    /// </summary>
    public float ContextPrecision { get; set; }

    /// <summary>
    /// Context recall, estimated from true positives (TP) and false negatives (FN)
    /// derived from the annotated answer and the retrieved context.
    /// </summary>
    public float ContextRecall { get; set; }
}
74 changes: 74 additions & 0 deletions
74
applications/evaluation/Evaluators/AnswerCorrectness/AnswerCorrectnessEvaluator.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text.Json; | ||
using System.Threading.Tasks; | ||
using Microsoft.KernelMemory.Evaluation; | ||
using Microsoft.KernelMemory.Evaluation.TestSet; | ||
using Microsoft.SemanticKernel; | ||
using Microsoft.SemanticKernel.Connectors.OpenAI; | ||
|
||
// ReSharper disable CheckNamespace | ||
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness; | ||
|
||
/// <summary>
/// Scores an answer by extracting its statements with one prompt, classifying them into
/// TP / FP / FN lists with a second prompt, and combining the list sizes into
/// <c>TP / (TP + 0.5 * (FP + FN))</c>.
/// </summary>
internal sealed class AnswerCorrectnessEvaluator : EvaluationEngine
{
    private readonly Kernel _kernel;

    // Prompt functions are built on first use and then reused, so the embedded prompt is
    // not re-read and the function is not re-created on every access or retry.
    private KernelFunction? _extractStatements;

    private KernelFunction? _evaluateCorrectness;

    private KernelFunction ExtractStatements => this._extractStatements ??= this._kernel.CreateFunctionFromPrompt(
        this.GetSKPrompt("Extraction", "Statements"),
        new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
        },
        functionName: nameof(this.ExtractStatements));

    private KernelFunction EvaluateCorrectness => this._evaluateCorrectness ??= this._kernel.CreateFunctionFromPrompt(
        this.GetSKPrompt("Evaluation", "Correctness"),
        new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
        },
        functionName: nameof(this.EvaluateCorrectness));

    /// <summary>
    /// Creates the evaluator; it works on its own clone of <paramref name="kernel"/>.
    /// </summary>
    /// <param name="kernel">Kernel used to run the prompts.</param>
    public AnswerCorrectnessEvaluator(Kernel kernel)
    {
        this._kernel = kernel.Clone();
    }

    /// <summary>
    /// Computes the correctness score of <paramref name="answer"/>.
    /// </summary>
    /// <param name="testSet">Test set item supplying the reference data sent to the correctness prompt.</param>
    /// <param name="answer">Answer under evaluation (its question and result text are used).</param>
    /// <param name="metadata">
    /// Receives the raw classification under the key <c>AnswerCorrectnessEvaluator-Evaluation</c>.
    /// </param>
    /// <returns>
    /// <c>TP / (TP + 0.5 * (FP + FN))</c>, or 0 when a prompt result deserializes to null or
    /// when no statement was classified at all.
    /// </returns>
    internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
    {
        // Each prompt gets up to 3 retries: the model output must parse as JSON.
        var statements = await this.Try(3, async (remainingTry) =>
        {
            var extraction = await this.ExtractStatements.InvokeAsync(this._kernel, new KernelArguments
            {
                { "question", answer.Question },
                { "answer", answer.Result }
            }).ConfigureAwait(false);

            return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
        }).ConfigureAwait(false);

        if (statements is null)
        {
            return 0;
        }

        var evaluation = await this.Try(3, async (remainingTry) =>
        {
            // NOTE(review): "ground_truth" is fed from testSet.Context, not testSet.GroundTruth.
            // Kept as-is; confirm against the Correctness prompt that this is intended.
            var extraction = await this.EvaluateCorrectness.InvokeAsync(this._kernel, new KernelArguments
            {
                { "question", answer.Question },
                { "answer", JsonSerializer.Serialize(statements) },
                { "ground_truth", JsonSerializer.Serialize(testSet.Context) }
            }).ConfigureAwait(false);

            return JsonSerializer.Deserialize<CorrectnessEvaluation>(extraction.GetValue<string>()!);
        }).ConfigureAwait(false);

        if (evaluation is null)
        {
            return 0;
        }

        metadata.Add($"{nameof(AnswerCorrectnessEvaluator)}-Evaluation", evaluation);

        // A list omitted from the model's JSON may be left null; count it as empty.
        var truePositives = evaluation.TP?.Count() ?? 0;
        var falsePositives = evaluation.FP?.Count() ?? 0;
        var falseNegatives = evaluation.FN?.Count() ?? 0;

        // Nothing classified: avoid 0 / 0, which would yield NaN.
        if (truePositives + falsePositives + falseNegatives == 0)
        {
            return 0;
        }

        return (float)truePositives /
               (float)(truePositives + .5 * (falsePositives + falseNegatives));
    }
}
18 changes: 18 additions & 0 deletions
18
applications/evaluation/Evaluators/AnswerCorrectness/CorrectnessEvaluation.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Collections.Generic; | ||
|
||
// ReSharper disable InconsistentNaming | ||
// ReSharper disable CheckNamespace | ||
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness; | ||
|
||
#pragma warning disable CA1812 // Internal class that is apparently never instantiated (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
/// <summary>
/// Classification of statements into three lists named FP, FN and TP
/// (presumably false positives, false negatives and true positives).
/// Every list defaults to an empty collection, so a property that is never
/// assigned can still be enumerated safely.
/// </summary>
internal sealed class CorrectnessEvaluation
#pragma warning restore CA1812 // Internal class that is apparently never instantiated (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
{
    /// <summary>
    /// Statements in the FP list.
    /// </summary>
    public IEnumerable<StatementEvaluation> FP { get; set; } = [];

    /// <summary>
    /// Statements in the FN list.
    /// </summary>
    public IEnumerable<StatementEvaluation> FN { get; set; } = [];

    /// <summary>
    /// Statements in the TP list.
    /// </summary>
    public IEnumerable<StatementEvaluation> TP { get; set; } = [];
}
12 changes: 12 additions & 0 deletions
12
applications/evaluation/Evaluators/AnswerCorrectness/StatementEvaluation.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness; | ||
|
||
#pragma warning disable CA1812 // Internal class that is apparently never instantiated (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
/// <summary>
/// A single statement together with the reason attached to it.
/// Both values default to an empty string, so the non-nullable contract holds
/// even when a property is never assigned.
/// </summary>
internal sealed class StatementEvaluation
#pragma warning restore CA1812 // Internal class that is apparently never instantiated (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
{
    /// <summary>
    /// Text of the statement.
    /// </summary>
    public string Statement { get; set; } = string.Empty;

    /// <summary>
    /// Reason associated with the statement.
    /// </summary>
    public string Reason { get; set; } = string.Empty;
}
41 changes: 41 additions & 0 deletions
41
applications/evaluation/Evaluators/AnswerSimilarity/AnswerSimilarityEvaluator.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Numerics.Tensors; | ||
using System.Threading.Tasks; | ||
using Microsoft.Extensions.DependencyInjection; | ||
using Microsoft.KernelMemory.Evaluation; | ||
using Microsoft.KernelMemory.Evaluation.TestSet; | ||
using Microsoft.SemanticKernel; | ||
using Microsoft.SemanticKernel.Embeddings; | ||
|
||
// ReSharper disable CheckNamespace | ||
namespace Microsoft.KernelMemory.Evaluators.AnswerSimilarity; | ||
|
||
/// <summary>
/// Scores an answer by the cosine similarity between the embedding of the test set's
/// ground truth and the embedding of the generated answer.
/// </summary>
internal sealed class AnswerSimilarityEvaluator : EvaluationEngine
{
    private readonly Kernel _kernel;

    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;

    /// <summary>
    /// Creates the evaluator on a clone of <paramref name="kernel"/> and resolves the
    /// required <see cref="ITextEmbeddingGenerationService"/> from the clone's services.
    /// </summary>
    /// <param name="kernel">Kernel providing the embedding generation service.</param>
    public AnswerSimilarityEvaluator(Kernel kernel)
    {
        var clonedKernel = kernel.Clone();

        this._kernel = clonedKernel;
        this._textEmbeddingGenerationService = clonedKernel.Services.GetRequiredService<ITextEmbeddingGenerationService>();
    }

    /// <summary>
    /// Embeds the ground truth and the answer text, then compares the two vectors.
    /// </summary>
    /// <param name="testSet">Test set item providing the ground truth text.</param>
    /// <param name="answer">Answer whose result text is compared to the ground truth.</param>
    /// <param name="metadata">
    /// Receives the score under the key <c>AnswerSimilarityEvaluator-Evaluation</c>.
    /// </param>
    /// <returns>Cosine similarity between the first and the last returned embedding.</returns>
    internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
    {
        // Both texts are embedded in one request: ground truth first, answer second.
        var embeddings = await this._textEmbeddingGenerationService
            .GenerateEmbeddingsAsync([testSet.GroundTruth, answer.Result], this._kernel)
            .ConfigureAwait(false);

        var similarity = TensorPrimitives.CosineSimilarity(
            embeddings.First().Span,
            embeddings.Last().Span);

        var metadataKey = $"{nameof(AnswerSimilarityEvaluator)}-Evaluation";
        metadata.Add(metadataKey, similarity);

        return similarity;
    }
}
Oops, something went wrong.