Skip to content

Commit

Permalink
Add evaluation core functions (#515)
Browse files Browse the repository at this point in the history
## Motivation and Context (Why the change? What's the scenario?)
First code commit for evaluating Kernel Memory (KM) using the RAGAS
methodology.

## High level description (Approach, Design)

Add test set generation with Semantic Kernel (SK), using the KM index over existing data.
Add test set evaluation of KM using SK.
  • Loading branch information
kbeaugrand authored May 25, 2024
1 parent 37b001d commit dd9915d
Show file tree
Hide file tree
Showing 38 changed files with 1,968 additions and 2 deletions.
8 changes: 7 additions & 1 deletion KernelMemory.sln
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SQLServer", "extensions\SQL
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Elasticsearch", "extensions\Elasticsearch\Elasticsearch\Elasticsearch.csproj", "{2E10420F-BF96-411C-8FE0-F6268F2EEB67}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Elasticsearch.UnitTests", "extensions\Elasticsearch\Elasticsearch.FunctionalTests\Elasticsearch.FunctionalTests.csproj", "{C5E6B28C-F54D-423D-954D-A9EAEFB89732}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Elasticsearch.FunctionalTests", "extensions\Elasticsearch\Elasticsearch.FunctionalTests\Elasticsearch.FunctionalTests.csproj", "{C5E6B28C-F54D-423D-954D-A9EAEFB89732}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Discord", "extensions\Discord\Discord\Discord.csproj", "{43877864-6AE8-4B03-BEDA-6B6FA8BB1D8B}"
EndProject
Expand All @@ -281,6 +281,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "110-dotnet-anthropic", "exa
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "302-dotnet-sk-km-chat", "examples\302-dotnet-sk-km-chat\302-dotnet-sk-km-chat.csproj", "{37FA99CB-AD22-4BAC-B76F-961F84422DEE}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "401-evaluation", "examples\401-evaluation\401-evaluation.csproj", "{D1308C73-79B6-4635-B50D-420742D09C20}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -528,6 +530,9 @@ Global
{37FA99CB-AD22-4BAC-B76F-961F84422DEE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{37FA99CB-AD22-4BAC-B76F-961F84422DEE}.Debug|Any CPU.Build.0 = Debug|Any CPU
{37FA99CB-AD22-4BAC-B76F-961F84422DEE}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D1308C73-79B6-4635-B50D-420742D09C20}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D1308C73-79B6-4635-B50D-420742D09C20}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D1308C73-79B6-4635-B50D-420742D09C20}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -613,6 +618,7 @@ Global
{A0C81A29-715F-463E-A243-7E45DB8AE53F} = {155DA079-E267-49AF-973A-D1D44681970F}
{EE0D8645-2770-4E12-8E18-019B30970FE6} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
{37FA99CB-AD22-4BAC-B76F-961F84422DEE} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
{D1308C73-79B6-4635-B50D-420742D09C20} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}
Expand Down
57 changes: 56 additions & 1 deletion applications/evaluation/Evaluation.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<AssemblyName>Microsoft.KernelMemory.Evaluation</AssemblyName>
<RootNamespace>Microsoft.KernelMemory</RootNamespace>
<RootNamespace>Microsoft.KernelMemory.Evaluation</RootNamespace>
<NoWarn>$(NoWarn);KMEXP00;CA1711;CS1591;CS1574;NU5104;SKEXP0001;</NoWarn>
</PropertyGroup>

Expand All @@ -20,4 +20,59 @@
<ProjectReference Include="..\..\service\Core\Core.csproj" />
</ItemGroup>

<ItemGroup>
<!--
Prompt templates embedded into the assembly as manifest resources.
LogicalName overrides the default resource name: it takes the item's relative directory,
replaces backslashes with forward slashes, and appends the file name and extension, so that
e.g. Prompts\Evaluation\ContextRecall.txt is exposed as "Prompts/Evaluation/ContextRecall.txt".
This is the name format that EvaluationEngine.GetSKPrompt passes to GetManifestResourceStream.
-->
<EmbeddedResource Include="Prompts\Evaluation\ContextRecall.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Evaluation\Correctness.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Evaluation\ContextPrecision.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Extraction\Question.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Evaluation\Faithfulness.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Extraction\Statements.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Extraction\Keyphrase.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\SyntheticData\ConditionalQuestion.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\SyntheticData\QuestionAnswer.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\SyntheticData\MultiContextQuestion.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\SyntheticData\ReasoningQuestion.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\SyntheticData\SeedQuestion.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="Prompts\Transmutation\Translate.txt">
<CustomToolNamespace></CustomToolNamespace>
<LogicalName>$([System.String]::new('%(RelativeDir)').Replace('\','/'))%(FileName)%(Extension)</LogicalName>
</EmbeddedResource>
</ItemGroup>

</Project>
78 changes: 78 additions & 0 deletions applications/evaluation/EvaluationEngine.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Security.Cryptography;
using System.Threading.Tasks;
using Microsoft.KernelMemory.MemoryStorage;

namespace Microsoft.KernelMemory.Evaluation;

/// <summary>
/// Base class for evaluation components. Provides helpers to load embedded prompt templates,
/// retry a failing asynchronous operation, partition memory records and shuffle sequences.
/// </summary>
public abstract class EvaluationEngine
{
    /// <summary>
    /// Loads a prompt template embedded in this assembly under the resource name
    /// <c>Prompts/{pluginName}/{functionName}.txt</c>.
    /// </summary>
    /// <param name="pluginName">The prompt folder name (e.g. "Evaluation" or "Extraction").</param>
    /// <param name="functionName">The prompt file name, without the ".txt" extension.</param>
    /// <returns>The full text of the embedded prompt.</returns>
    /// <exception cref="InvalidOperationException">No embedded resource with the computed name exists.</exception>
    protected string GetSKPrompt(string pluginName, string functionName)
    {
        var resourceName = $"Prompts/{pluginName}/{functionName}.txt";

        // GetManifestResourceStream returns null (it does not throw) when the resource is missing,
        // so fail here with a message that names the resource instead of a null-argument error later.
        var resourceStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)
            ?? throw new InvalidOperationException($"Embedded prompt resource '{resourceName}' was not found.");

        // Disposing the reader also disposes the underlying stream.
        using var reader = new StreamReader(resourceStream);
        return reader.ReadToEnd();
    }

    /// <summary>
    /// Runs <paramref name="action"/> and retries it when it throws.
    /// </summary>
    /// <typeparam name="T">The type of the value produced by the action.</typeparam>
    /// <param name="maxCount">
    /// The number of retries allowed after the first attempt, i.e. the action runs at most
    /// <c>maxCount + 1</c> times. A value of zero or less means a single attempt.
    /// </param>
    /// <param name="action">The operation to run; it receives the number of retries still available.</param>
    /// <returns>The value returned by the first successful attempt.</returns>
    /// <remarks>The exception thrown by the last attempt propagates unchanged to the caller.</remarks>
    protected async Task<T> Try<T>(int maxCount, Func<int, Task<T>> action)
    {
        for (var remaining = maxCount; ; remaining--)
        {
            try
            {
                return await action(remaining).ConfigureAwait(false);
            }
            catch (Exception) when (remaining > 0)
            {
                // Retries are still available: swallow the failure and run the next attempt.
                // When no retries remain the filter is false and the original exception propagates.
            }
        }
    }

    /// <summary>
    /// Splits records into <paramref name="count"/> contiguous groups (nodes) of near-equal size.
    /// </summary>
    /// <param name="records">The records to create nodes.</param>
    /// <param name="count">The number of nodes to create.</param>
    /// <returns>
    /// Exactly <paramref name="count"/> groups that together contain every record once, in the original
    /// order; group sizes differ by at most one and trailing groups are empty when there are fewer
    /// records than groups. An empty list is returned when <paramref name="count"/> is zero or negative.
    /// </returns>
    protected IEnumerable<MemoryRecord[]> SplitRecordsIntoNodes(MemoryRecord[] records, int count)
    {
        var groups = new List<MemoryRecord[]>();

        if (count <= 0)
        {
            return groups;
        }

        // Integer division plus remainder guarantees no record is lost to rounding:
        // the first `remainder` groups receive one extra record.
        var baseSize = records.Length / count;
        var remainder = records.Length % count;
        var offset = 0;

        for (var i = 0; i < count; i++)
        {
            var size = i < remainder ? baseSize + 1 : baseSize;

            groups.Add(records[offset..(offset + size)]);
            offset += size;
        }

        return groups;
    }

    /// <summary>
    /// Returns the elements of <paramref name="source"/> in random order.
    /// </summary>
    /// <typeparam name="T">The element type.</typeparam>
    /// <param name="source">The sequence to shuffle; it is copied and left unmodified.</param>
    /// <returns>A new array containing the same elements, shuffled with <see cref="RandomNumberGenerator"/>.</returns>
    protected IEnumerable<T> Shuffle<T>(IEnumerable<T> source)
    {
        var items = source.ToArray();

        // Shuffle the private copy in place; no second copy is needed.
        RandomNumberGenerator.Shuffle(items.AsSpan());

        return items;
    }
}
36 changes: 36 additions & 0 deletions applications/evaluation/EvaluationMetrics.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft. All rights reserved.

namespace Microsoft.KernelMemory.Evaluation;

/// <summary>
/// Holds the scores computed for one evaluated answer, one property per metric.
/// </summary>
public sealed class EvaluationMetrics
{
    /// <summary>
    /// How relevant the generated answer is to the question that was asked.
    /// </summary>
    public float AnswerRelevancy { get; set; }

    /// <summary>
    /// Semantic similarity between the ground truth and the generated answer.
    /// </summary>
    public float AnswerSemanticSimilarity { get; set; }

    /// <summary>
    /// Correctness of the answer with respect to the ground truth, combining factuality and semantic similarity.
    /// </summary>
    public float AnswerCorrectness { get; set; }

    /// <summary>
    /// Factual consistency of the generated answer with the given context.
    /// </summary>
    public float Faithfulness { get; set; }

    /// <summary>
    /// Average precision of the context: whether the relevant items selected by the model are ranked higher.
    /// </summary>
    public float ContextPrecision { get; set; }

    /// <summary>
    /// Context recall, estimated from true positives and false negatives using the annotated answer and the retrieved context.
    /// </summary>
    public float ContextRecall { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Threading.Tasks;
using Microsoft.KernelMemory.Evaluation;
using Microsoft.KernelMemory.Evaluation.TestSet;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;

// ReSharper disable CheckNamespace
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;

/// <summary>
/// Scores the correctness of a generated answer: an LLM prompt extracts statements from the answer,
/// a second prompt classifies them into TP / FP / FN lists, and the score is computed from the list sizes.
/// </summary>
internal sealed class AnswerCorrectnessEvaluator : EvaluationEngine
{
    private readonly Kernel _kernel;

    // Prompt function that extracts statements from the question/answer pair.
    // NOTE(review): the near-zero temperature is presumably meant to keep the output as deterministic as possible.
    private KernelFunction ExtractStatements => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Statements"), new OpenAIPromptExecutionSettings
    {
        Temperature = 1e-8f,
    }, functionName: nameof(this.ExtractStatements));

    // Prompt function that classifies the extracted statements against the ground truth.
    private KernelFunction EvaluateCorrectness => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "Correctness"), new OpenAIPromptExecutionSettings
    {
        Temperature = 1e-8f,
    }, functionName: nameof(this.EvaluateCorrectness));

    /// <summary>
    /// Creates the evaluator, working on a clone of the given kernel.
    /// </summary>
    /// <param name="kernel">The kernel used to run the prompts.</param>
    public AnswerCorrectnessEvaluator(Kernel kernel)
    {
        this._kernel = kernel.Clone();
    }

    /// <summary>
    /// Computes the correctness score <c>TP / (TP + 0.5 * (FP + FN))</c> for the given answer.
    /// </summary>
    /// <param name="testSet">The test set item the answer is compared with.</param>
    /// <param name="answer">The answer to evaluate.</param>
    /// <param name="metadata">Receives the raw classification under the key "AnswerCorrectnessEvaluator-Evaluation".</param>
    /// <returns>
    /// The score, or 0 when a prompt result deserializes to null or when no statement was classified at all.
    /// </returns>
    internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
    {
        // Each prompt call is retried because the model output may not be valid JSON.
        var statements = await this.Try(3, async (remainingTry) =>
        {
            var extraction = await this.ExtractStatements.InvokeAsync(this._kernel, new KernelArguments
            {
                { "question", answer.Question },
                { "answer", answer.Result }
            }).ConfigureAwait(false);

            return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
        }).ConfigureAwait(false);

        if (statements is null)
        {
            return 0;
        }

        var evaluation = await this.Try(3, async (remainingTry) =>
        {
            // NOTE(review): "ground_truth" is filled from testSet.Context, not testSet.GroundTruth;
            // confirm this is intended.
            var extraction = await this.EvaluateCorrectness.InvokeAsync(this._kernel, new KernelArguments
            {
                { "question", answer.Question },
                { "answer", JsonSerializer.Serialize(statements) },
                { "ground_truth", JsonSerializer.Serialize(testSet.Context) }
            }).ConfigureAwait(false);

            return JsonSerializer.Deserialize<CorrectnessEvaluation>(extraction.GetValue<string>()!);
        }).ConfigureAwait(false);

        if (evaluation is null)
        {
            return 0;
        }

        metadata.Add($"{nameof(AnswerCorrectnessEvaluator)}-Evaluation", evaluation);

        // A list can be null when the model's JSON omits it or sets it to null; count it as empty.
        var truePositives = evaluation.TP?.Count() ?? 0;
        var falsePositives = evaluation.FP?.Count() ?? 0;
        var falseNegatives = evaluation.FN?.Count() ?? 0;

        var denominator = truePositives + .5 * (falsePositives + falseNegatives);

        // With no classified statement the ratio would be 0/0 (NaN); report a zero score instead.
        if (denominator <= 0)
        {
            return 0;
        }

        return (float)truePositives / (float)denominator;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;

// ReSharper disable InconsistentNaming
// ReSharper disable CheckNamespace
namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;

#pragma warning disable CA1812 // 'CorrectnessEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
/// <summary>
/// Result of the correctness prompt, created by JSON deserialization of the model output.
/// Each list defaults to an empty collection, so a property missing from the JSON is not null.
/// </summary>
internal sealed class CorrectnessEvaluation
#pragma warning restore CA1812 // 'CorrectnessEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
{
    /// <summary>
    /// Statements classified as FP (presumably "false positive" — confirm against the Correctness prompt).
    /// </summary>
    public IEnumerable<StatementEvaluation> FP { get; set; } = [];

    /// <summary>
    /// Statements classified as FN (presumably "false negative" — confirm against the Correctness prompt).
    /// </summary>
    public IEnumerable<StatementEvaluation> FN { get; set; } = [];

    /// <summary>
    /// Statements classified as TP (presumably "true positive" — confirm against the Correctness prompt).
    /// </summary>
    public IEnumerable<StatementEvaluation> TP { get; set; } = [];
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Copyright (c) Microsoft. All rights reserved.

namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;

#pragma warning disable CA1812 // 'StatementEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
/// <summary>
/// One entry of a correctness classification list: a statement together with a reason.
/// </summary>
internal sealed class StatementEvaluation
#pragma warning restore CA1812 // 'StatementEvaluation' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
{
    /// <summary>
    /// The statement text. Remains null until it is assigned.
    /// </summary>
    public string Statement { get; set; } = null!;

    /// <summary>
    /// The reason attached to the statement. Remains null until it is assigned.
    /// </summary>
    public string Reason { get; set; } = null!;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using System.Linq;
using System.Numerics.Tensors;
using System.Threading.Tasks;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.KernelMemory.Evaluation;
using Microsoft.KernelMemory.Evaluation.TestSet;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Embeddings;

// ReSharper disable CheckNamespace
namespace Microsoft.KernelMemory.Evaluators.AnswerSimilarity;

/// <summary>
/// Scores an answer by the cosine similarity between the embedding of the ground truth
/// and the embedding of the generated answer.
/// </summary>
internal sealed class AnswerSimilarityEvaluator : EvaluationEngine
{
    private readonly Kernel _kernel;

    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;

    /// <summary>
    /// Creates the evaluator on a clone of the given kernel and resolves the embedding service from it.
    /// </summary>
    /// <param name="kernel">The kernel providing the text embedding generation service.</param>
    public AnswerSimilarityEvaluator(Kernel kernel)
    {
        var clonedKernel = kernel.Clone();

        this._kernel = clonedKernel;
        this._textEmbeddingGenerationService = clonedKernel.Services.GetRequiredService<ITextEmbeddingGenerationService>();
    }

    /// <summary>
    /// Embeds the ground truth and the answer text, and returns their cosine similarity.
    /// </summary>
    /// <param name="testSet">The test set item providing the ground truth.</param>
    /// <param name="answer">The answer whose result text is compared.</param>
    /// <param name="metadata">Receives the similarity under the key "AnswerSimilarityEvaluator-Evaluation".</param>
    /// <returns>The cosine similarity of the two embeddings.</returns>
    internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
    {
        // Both texts are embedded in a single call: ground truth first, answer second.
        var embeddings = await this._textEmbeddingGenerationService
            .GenerateEmbeddingsAsync([testSet.GroundTruth, answer.Result], this._kernel)
            .ConfigureAwait(false);

        var groundTruthEmbedding = embeddings.First();
        var answerEmbedding = embeddings.Last();

        var similarity = TensorPrimitives.CosineSimilarity(groundTruthEmbedding.Span, answerEmbedding.Span);

        metadata.Add($"{nameof(AnswerSimilarityEvaluator)}-Evaluation", similarity);

        return similarity;
    }
}
Loading

0 comments on commit dd9915d

Please sign in to comment.