Skip to content

Commit dd9915d

Browse files
authored
Add evaluation core functions (#515)
## Motivation and Context (Why the change? What's the scenario?) First code commit for evaluation of KM memory by using RAGAS methodology. ## High level description (Approach, Design) Add Test set generation with SK, by using KM index for existing data. Add Test set evaluation of KM by using SK.
1 parent 37b001d commit dd9915d

38 files changed

+1968
-2
lines changed

KernelMemory.sln

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SQLServer", "extensions\SQL
265265
EndProject
266266
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Elasticsearch", "extensions\Elasticsearch\Elasticsearch\Elasticsearch.csproj", "{2E10420F-BF96-411C-8FE0-F6268F2EEB67}"
267267
EndProject
268-
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Elasticsearch.UnitTests", "extensions\Elasticsearch\Elasticsearch.FunctionalTests\Elasticsearch.FunctionalTests.csproj", "{C5E6B28C-F54D-423D-954D-A9EAEFB89732}"
268+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Elasticsearch.FunctionalTests", "extensions\Elasticsearch\Elasticsearch.FunctionalTests\Elasticsearch.FunctionalTests.csproj", "{C5E6B28C-F54D-423D-954D-A9EAEFB89732}"
269269
EndProject
270270
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Discord", "extensions\Discord\Discord\Discord.csproj", "{43877864-6AE8-4B03-BEDA-6B6FA8BB1D8B}"
271271
EndProject
@@ -281,6 +281,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "110-dotnet-anthropic", "exa
281281
EndProject
282282
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "302-dotnet-sk-km-chat", "examples\302-dotnet-sk-km-chat\302-dotnet-sk-km-chat.csproj", "{37FA99CB-AD22-4BAC-B76F-961F84422DEE}"
283283
EndProject
284+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "401-evaluation", "examples\401-evaluation\401-evaluation.csproj", "{D1308C73-79B6-4635-B50D-420742D09C20}"
285+
EndProject
284286
Global
285287
GlobalSection(SolutionConfigurationPlatforms) = preSolution
286288
Debug|Any CPU = Debug|Any CPU
@@ -528,6 +530,9 @@ Global
528530
{37FA99CB-AD22-4BAC-B76F-961F84422DEE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
529531
{37FA99CB-AD22-4BAC-B76F-961F84422DEE}.Debug|Any CPU.Build.0 = Debug|Any CPU
530532
{37FA99CB-AD22-4BAC-B76F-961F84422DEE}.Release|Any CPU.ActiveCfg = Release|Any CPU
533+
{D1308C73-79B6-4635-B50D-420742D09C20}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
534+
{D1308C73-79B6-4635-B50D-420742D09C20}.Debug|Any CPU.Build.0 = Debug|Any CPU
535+
{D1308C73-79B6-4635-B50D-420742D09C20}.Release|Any CPU.ActiveCfg = Release|Any CPU
531536
EndGlobalSection
532537
GlobalSection(SolutionProperties) = preSolution
533538
HideSolutionNode = FALSE
@@ -613,6 +618,7 @@ Global
613618
{A0C81A29-715F-463E-A243-7E45DB8AE53F} = {155DA079-E267-49AF-973A-D1D44681970F}
614619
{EE0D8645-2770-4E12-8E18-019B30970FE6} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
615620
{37FA99CB-AD22-4BAC-B76F-961F84422DEE} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
621+
{D1308C73-79B6-4635-B50D-420742D09C20} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
616622
EndGlobalSection
617623
GlobalSection(ExtensibilityGlobals) = postSolution
618624
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}

applications/evaluation/Evaluation.csproj

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<PropertyGroup>
44
<TargetFramework>net8.0</TargetFramework>
55
<AssemblyName>Microsoft.KernelMemory.Evaluation</AssemblyName>
6-
<RootNamespace>Microsoft.KernelMemory</RootNamespace>
6+
<RootNamespace>Microsoft.KernelMemory.Evaluation</RootNamespace>
77
<NoWarn>$(NoWarn);KMEXP00;CA1711;CS1591;CS1574;NU5104;SKEXP0001;</NoWarn>
88
</PropertyGroup>
99

@@ -20,4 +20,59 @@
2020
<ProjectReference Include="..\..\service\Core\Core.csproj" />
2121
</ItemGroup>
2222

23+
<ItemGroup>
24+
<EmbeddedResource Include="Prompts\Evaluation\ContextRecall.txt">
25+
<CustomToolNamespace></CustomToolNamespace>
26+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
27+
</EmbeddedResource>
28+
<EmbeddedResource Include="Prompts\Evaluation\Correctness.txt">
29+
<CustomToolNamespace></CustomToolNamespace>
30+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
31+
</EmbeddedResource>
32+
<EmbeddedResource Include="Prompts\Evaluation\ContextPrecision.txt">
33+
<CustomToolNamespace></CustomToolNamespace>
34+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
35+
</EmbeddedResource>
36+
<EmbeddedResource Include="Prompts\Extraction\Question.txt">
37+
<CustomToolNamespace></CustomToolNamespace>
38+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
39+
</EmbeddedResource>
40+
<EmbeddedResource Include="Prompts\Evaluation\Faithfulness.txt">
41+
<CustomToolNamespace></CustomToolNamespace>
42+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
43+
</EmbeddedResource>
44+
<EmbeddedResource Include="Prompts\Extraction\Statements.txt">
45+
<CustomToolNamespace></CustomToolNamespace>
46+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
47+
</EmbeddedResource>
48+
<EmbeddedResource Include="Prompts\Extraction\Keyphrase.txt">
49+
<CustomToolNamespace></CustomToolNamespace>
50+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
51+
</EmbeddedResource>
52+
<EmbeddedResource Include="Prompts\SyntheticData\ConditionalQuestion.txt">
53+
<CustomToolNamespace></CustomToolNamespace>
54+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
55+
</EmbeddedResource>
56+
<EmbeddedResource Include="Prompts\SyntheticData\QuestionAnswer.txt">
57+
<CustomToolNamespace></CustomToolNamespace>
58+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
59+
</EmbeddedResource>
60+
<EmbeddedResource Include="Prompts\SyntheticData\MultiContextQuestion.txt">
61+
<CustomToolNamespace></CustomToolNamespace>
62+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
63+
</EmbeddedResource>
64+
<EmbeddedResource Include="Prompts\SyntheticData\ReasoningQuestion.txt">
65+
<CustomToolNamespace></CustomToolNamespace>
66+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
67+
</EmbeddedResource>
68+
<EmbeddedResource Include="Prompts\SyntheticData\SeedQuestion.txt">
69+
<CustomToolNamespace></CustomToolNamespace>
70+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
71+
</EmbeddedResource>
72+
<EmbeddedResource Include="Prompts\Transmutation\Translate.txt">
73+
<CustomToolNamespace></CustomToolNamespace>
74+
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
75+
</EmbeddedResource>
76+
</ItemGroup>
77+
2378
</Project>
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System;
4+
using System.Collections.Generic;
5+
using System.IO;
6+
using System.Linq;
7+
using System.Reflection;
8+
using System.Security.Cryptography;
9+
using System.Threading.Tasks;
10+
using Microsoft.KernelMemory.MemoryStorage;
11+
12+
namespace Microsoft.KernelMemory.Evaluation;
13+
14+
/// <summary>
/// Base class with helpers shared by evaluation components: loading embedded prompt
/// templates, retrying failing asynchronous operations, and splitting or shuffling
/// memory records.
/// </summary>
public abstract class EvaluationEngine
{
    /// <summary>
    /// Reads the prompt template embedded in the executing assembly under the resource
    /// name <c>Prompts/{pluginName}/{functionName}.txt</c>.
    /// </summary>
    /// <param name="pluginName">Folder segment of the resource name (e.g. "Evaluation").</param>
    /// <param name="functionName">File name of the prompt, without the ".txt" extension.</param>
    /// <returns>The full text of the embedded prompt.</returns>
    /// <exception cref="FileNotFoundException">No embedded resource has the computed name.</exception>
    protected string GetSKPrompt(string pluginName, string functionName)
    {
        var resourceName = $"Prompts/{pluginName}/{functionName}.txt";

        // GetManifestResourceStream returns null when the resource does not exist; fail with a
        // message that names the missing resource instead of an opaque null-argument error.
        var resourceStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)
                             ?? throw new FileNotFoundException($"Embedded prompt resource '{resourceName}' was not found.", resourceName);

        // Disposing the reader also disposes the underlying resource stream.
        using var reader = new StreamReader(resourceStream);
        return reader.ReadToEnd();
    }

    /// <summary>
    /// Runs <paramref name="action"/> and retries it when it throws.
    /// The action is attempted at most <paramref name="maxCount"/> + 1 times: one initial
    /// attempt plus <paramref name="maxCount"/> retries.
    /// </summary>
    /// <typeparam name="T">Type of the value produced by the action.</typeparam>
    /// <param name="maxCount">Number of retries allowed after the first attempt. Zero or a negative value means a single attempt.</param>
    /// <param name="action">
    /// Operation to run. It receives the number of retries still available
    /// (<paramref name="maxCount"/> on the first attempt, counting down to 0 on the last one).
    /// </param>
    /// <returns>The result of the first attempt that completes without throwing.</returns>
    /// <remarks>The exception of the last failed attempt is propagated unchanged.</remarks>
    protected async Task<T> Try<T>(int maxCount, Func<int, Task<T>> action)
    {
        ArgumentNullException.ThrowIfNull(action);

        for (var remaining = maxCount; ; remaining--)
        {
            try
            {
                return await action(remaining).ConfigureAwait(false);
            }
            catch (Exception) when (remaining > 0)
            {
                // Retries are still available: swallow the failure and try again.
                // When none are left the filter does not match, so the original exception
                // (including for a negative maxCount) reaches the caller with its stack intact.
            }
        }
    }

    /// <summary>
    /// Split records into nodes
    /// </summary>
    /// <param name="records">The records to create nodes.</param>
    /// <param name="count">The number of nodes to create.</param>
    /// <returns>
    /// Exactly <paramref name="count"/> arrays that together contain every record once, in the
    /// original order. Node sizes differ by at most one; nodes are empty when there are fewer
    /// records than nodes. An empty sequence is returned when <paramref name="count"/> is not positive.
    /// </returns>
    protected IEnumerable<MemoryRecord[]> SplitRecordsIntoNodes(MemoryRecord[] records, int count)
    {
        ArgumentNullException.ThrowIfNull(records);

        var groups = new List<MemoryRecord[]>();
        if (count <= 0)
        {
            return groups;
        }

        // Distribute the remainder over the first nodes so that no record is dropped,
        // which a single rounded group size cannot guarantee.
        var baseSize = records.Length / count;
        var remainder = records.Length % count;
        var offset = 0;

        for (int i = 0; i < count; i++)
        {
            var size = i < remainder ? baseSize + 1 : baseSize;

            groups.Add(records.AsSpan(offset, size).ToArray());
            offset += size;
        }

        return groups;
    }

    /// <summary>
    /// Returns the elements of <paramref name="source"/> in random order.
    /// The source sequence itself is not modified.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="source">Sequence to shuffle; it is enumerated once.</param>
    /// <returns>A new collection containing the same elements in shuffled order.</returns>
    protected IEnumerable<T> Shuffle<T>(IEnumerable<T> source)
    {
        // Work on a private copy so the caller's collection keeps its order.
        var items = source.ToArray();

        RandomNumberGenerator.Shuffle(items.AsSpan());

        return items;
    }
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
namespace Microsoft.KernelMemory.Evaluation;
4+
5+
/// <summary>
/// Container for the scores produced when evaluating a generated answer.
/// Each property holds one metric; all values default to 0.
/// </summary>
public sealed class EvaluationMetrics
{
    /// <summary>
    /// How relevant the generated answer is to the question that was asked.
    /// </summary>
    public float AnswerRelevancy { get; set; }

    /// <summary>
    /// Semantic similarity between the ground truth and the generated answer.
    /// </summary>
    public float AnswerSemanticSimilarity { get; set; }

    /// <summary>
    /// Correctness of the answer relative to the ground truth, combining factuality
    /// and semantic similarity.
    /// </summary>
    public float AnswerCorrectness { get; set; }

    /// <summary>
    /// Factual consistency of the generated answer with respect to the given context.
    /// </summary>
    public float Faithfulness { get; set; }

    /// <summary>
    /// Average precision: evaluates whether the relevant items selected by the model
    /// are ranked higher or not.
    /// </summary>
    public float ContextPrecision { get; set; }

    /// <summary>
    /// Context recall, estimated from true positives and false negatives derived from
    /// the annotated answer and the retrieved context.
    /// </summary>
    public float ContextRecall { get; set; }
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text.Json;
6+
using System.Threading.Tasks;
7+
using Microsoft.KernelMemory.Evaluation;
8+
using Microsoft.KernelMemory.Evaluation.TestSet;
9+
using Microsoft.SemanticKernel;
10+
using Microsoft.SemanticKernel.Connectors.OpenAI;
11+
12+
// ReSharper disable CheckNamespace
13+
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
14+
15+
/// <summary>
/// Scores the correctness of a generated answer: the answer is first broken into
/// statements, the statements are then classified against the reference data as
/// TP / FP / FN, and the classification is reduced to a single F1-style score.
/// </summary>
internal sealed class AnswerCorrectnessEvaluator : EvaluationEngine
{
    private readonly Kernel _kernel;

    // Prompt function built from the embedded "Extraction/Statements" prompt.
    // A near-zero temperature is requested, presumably to keep the output as stable as possible.
    private KernelFunction ExtractStatements => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Statements"), new OpenAIPromptExecutionSettings
    {
        Temperature = 1e-8f,
    }, functionName: nameof(this.ExtractStatements));

    // Prompt function built from the embedded "Evaluation/Correctness" prompt, same settings as above.
    private KernelFunction EvaluateCorrectness => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "Correctness"), new OpenAIPromptExecutionSettings
    {
        Temperature = 1e-8f,
    }, functionName: nameof(this.EvaluateCorrectness));

    /// <summary>
    /// Creates the evaluator; it works on a clone of <paramref name="kernel"/>.
    /// </summary>
    /// <param name="kernel">Kernel used to run the prompts.</param>
    public AnswerCorrectnessEvaluator(Kernel kernel)
    {
        this._kernel = kernel.Clone();
    }

    /// <summary>
    /// Computes the correctness score of <paramref name="answer"/>.
    /// </summary>
    /// <param name="testSet">Test set item providing the reference data for the comparison.</param>
    /// <param name="answer">The generated answer (question and result text) to evaluate.</param>
    /// <param name="metadata">
    /// Receives the raw classification under the key "AnswerCorrectnessEvaluator-Evaluation".
    /// </param>
    /// <returns>
    /// TP / (TP + 0.5 * (FP + FN)), or 0 when the model output deserializes to null
    /// or when no statement was classified at all.
    /// </returns>
    internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
    {
        // Step 1: split the answer into individual statements (retried when the call or the JSON parsing fails).
        var statements = await this.Try(3, async (remainingTry) =>
        {
            var extraction = await this.ExtractStatements.InvokeAsync(this._kernel, new KernelArguments
            {
                { "question", answer.Question },
                { "answer", answer.Result }
            }).ConfigureAwait(false);

            return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
        }).ConfigureAwait(false);

        if (statements is null)
        {
            return 0;
        }

        // Step 2: classify the statements against the reference data.
        var evaluation = await this.Try(3, async (remainingTry) =>
        {
            var extraction = await this.EvaluateCorrectness.InvokeAsync(this._kernel, new KernelArguments
            {
                { "question", answer.Question },
                { "answer", JsonSerializer.Serialize(statements) },
                // NOTE(review): the "ground_truth" argument is fed from testSet.Context;
                // confirm whether testSet.GroundTruth was intended here.
                { "ground_truth", JsonSerializer.Serialize(testSet.Context) }
            }).ConfigureAwait(false);

            return JsonSerializer.Deserialize<CorrectnessEvaluation>(extraction.GetValue<string>()!);
        }).ConfigureAwait(false);

        if (evaluation is null)
        {
            return 0;
        }

        metadata.Add($"{nameof(AnswerCorrectnessEvaluator)}-Evaluation", evaluation);

        // A list can be null when the model omits a key or emits an explicit null: count it as empty.
        var truePositives = evaluation.TP?.Count() ?? 0;
        var falsePositives = evaluation.FP?.Count() ?? 0;
        var falseNegatives = evaluation.FN?.Count() ?? 0;

        var denominator = (float)(truePositives + .5 * (falsePositives + falseNegatives));

        // Nothing was classified: report 0 instead of the NaN that 0 / 0 would produce.
        if (denominator <= 0)
        {
            return 0;
        }

        return (float)truePositives / denominator;
    }
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System.Collections.Generic;
4+
5+
// ReSharper disable InconsistentNaming
6+
// ReSharper disable CheckNamespace
7+
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
8+
9+
#pragma warning disable CA1812 // 'CorrectnessEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
/// <summary>
/// Classification of statements produced by the correctness evaluation, grouped into
/// FP / FN / TP lists. Instances are created by JSON deserialization.
/// The lists default to empty so that a payload missing one of the keys can still be counted.
/// </summary>
internal sealed class CorrectnessEvaluation
#pragma warning restore CA1812 // 'CorrectnessEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
{
    /// <summary>
    /// Statements classified as false positives.
    /// NOTE(review): presumably answer statements not supported by the ground truth; confirm against the prompt.
    /// </summary>
    public IEnumerable<StatementEvaluation> FP { get; set; } = System.Array.Empty<StatementEvaluation>();

    /// <summary>
    /// Statements classified as false negatives.
    /// NOTE(review): presumably ground truth statements missing from the answer; confirm against the prompt.
    /// </summary>
    public IEnumerable<StatementEvaluation> FN { get; set; } = System.Array.Empty<StatementEvaluation>();

    /// <summary>
    /// Statements classified as true positives.
    /// NOTE(review): presumably answer statements supported by the ground truth; confirm against the prompt.
    /// </summary>
    public IEnumerable<StatementEvaluation> TP { get; set; } = System.Array.Empty<StatementEvaluation>();
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
4+
5+
#pragma warning disable CA1812 // 'StatementEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
/// <summary>
/// A single evaluated statement together with the reason given for its classification.
/// Both values are unset (null) until assigned.
/// </summary>
internal sealed class StatementEvaluation
#pragma warning restore CA1812 // 'StatementEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
{
    /// <summary>
    /// Text of the statement that was evaluated.
    /// </summary>
    public string Statement { get; set; } = default!;

    /// <summary>
    /// Explanation attached to the statement's classification.
    /// </summary>
    public string Reason { get; set; } = default!;
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Numerics.Tensors;
6+
using System.Threading.Tasks;
7+
using Microsoft.Extensions.DependencyInjection;
8+
using Microsoft.KernelMemory.Evaluation;
9+
using Microsoft.KernelMemory.Evaluation.TestSet;
10+
using Microsoft.SemanticKernel;
11+
using Microsoft.SemanticKernel.Embeddings;
12+
13+
// ReSharper disable CheckNamespace
14+
namespace Microsoft.KernelMemory.Evaluators.AnswerSimilarity;
15+
16+
/// <summary>
/// Scores how close a generated answer is to the ground truth by comparing the
/// embeddings of both texts with cosine similarity.
/// </summary>
internal sealed class AnswerSimilarityEvaluator : EvaluationEngine
{
    private readonly Kernel _kernel;

    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;

    /// <summary>
    /// Creates the evaluator on a clone of <paramref name="kernel"/> and resolves the
    /// embedding generation service from the clone's service provider.
    /// </summary>
    /// <param name="kernel">Kernel that must have an <see cref="ITextEmbeddingGenerationService"/> registered.</param>
    public AnswerSimilarityEvaluator(Kernel kernel)
    {
        var clonedKernel = kernel.Clone();

        this._kernel = clonedKernel;
        this._textEmbeddingGenerationService = clonedKernel.Services.GetRequiredService<ITextEmbeddingGenerationService>();
    }

    /// <summary>
    /// Computes the cosine similarity between the embedding of the ground truth and the
    /// embedding of the generated answer.
    /// </summary>
    /// <param name="testSet">Test set item providing the ground truth text.</param>
    /// <param name="answer">Generated answer whose result text is compared.</param>
    /// <param name="metadata">Receives the score under the key "AnswerSimilarityEvaluator-Evaluation".</param>
    /// <returns>The cosine similarity of the two embeddings.</returns>
    internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
    {
        // Both texts are embedded in a single request: ground truth first, answer second.
        IList<string> texts = [testSet.GroundTruth, answer.Result];

        var embeddings = await this._textEmbeddingGenerationService
            .GenerateEmbeddingsAsync(texts, this._kernel)
            .ConfigureAwait(false);

        var groundTruthEmbedding = embeddings.First();
        var answerEmbedding = embeddings.Last();

        var similarity = TensorPrimitives.CosineSimilarity(groundTruthEmbedding.Span, answerEmbedding.Span);

        metadata.Add($"{nameof(AnswerSimilarityEvaluator)}-Evaluation", similarity);

        return similarity;
    }
}

0 commit comments

Comments
 (0)